diff --git a/Paper Reading Note/CRT_Chain-INFOCOM'18/1524747449744.png b/Paper Reading Note/CRT_Chain-INFOCOM'18/1524747449744.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/CRT_Chain-INFOCOM'18/1524753088554.png b/Paper Reading Note/CRT_Chain-INFOCOM'18/1524753088554.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/CRT_Chain-INFOCOM'18/1524908286158.png b/Paper Reading Note/CRT_Chain-INFOCOM'18/1524908286158.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/CRT_Chain-INFOCOM'18/1525441410699.png b/Paper Reading Note/CRT_Chain-INFOCOM'18/1525441410699.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/CRT_Chain-INFOCOM'18/CRT_Chain.md b/Paper Reading Note/CRT_Chain-INFOCOM'18/CRT_Chain.md
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/DDP-ICDCS'18/DDP-ICDCS'18.md b/Paper Reading Note/DDP-ICDCS'18/DDP-ICDCS'18.md
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/FADE-ICC'16/1520838349163.png b/Paper Reading Note/FADE-ICC'16/1520838349163.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/FADE-ICC'16/1520861458179.png b/Paper Reading Note/FADE-ICC'16/1520861458179.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/FADE-ICC'16/FADE.md b/Paper Reading Note/FADE-ICC'16/FADE.md
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/FADE-ICC'16/flow_selection_algorithm.png b/Paper Reading Note/FADE-ICC'16/flow_selection_algorithm.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/SDNtraceroute-HotSDN'14/1520995111174.png b/Paper Reading Note/SDNtraceroute-HotSDN'14/1520995111174.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/SDNtraceroute-HotSDN'14/1520997877920.png b/Paper Reading Note/SDNtraceroute-HotSDN'14/1520997877920.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/SDNtraceroute-HotSDN'14/SDNtraceroute.md b/Paper Reading Note/SDNtraceroute-HotSDN'14/SDNtraceroute.md
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/SDProber-SOSR'18/1521122343477.png b/Paper Reading Note/SDProber-SOSR'18/1521122343477.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/SDProber-SOSR'18/1521277960477.png b/Paper Reading Note/SDProber-SOSR'18/1521277960477.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/SDProber-SOSR'18/1521284130008.png b/Paper Reading Note/SDProber-SOSR'18/1521284130008.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/SDProber-SOSR'18/SDProber.md b/Paper Reading Note/SDProber-SOSR'18/SDProber.md
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/The Quantcast File System-VLDB'13/1523365611372.png b/Paper Reading Note/The Quantcast File System-VLDB'13/1523365611372.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/The Quantcast File System-VLDB'13/1523504685432.png b/Paper Reading Note/The Quantcast File System-VLDB'13/1523504685432.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/The Quantcast File System-VLDB'13/QFS.md b/Paper Reading Note/The Quantcast File System-VLDB'13/QFS.md
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/Track-CloudNet'17/1524660411325.png b/Paper Reading Note/Track-CloudNet'17/1524660411325.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/Track-CloudNet'17/1524660530235.png b/Paper Reading Note/Track-CloudNet'17/1524660530235.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/Track-CloudNet'17/1524661117396.png b/Paper Reading Note/Track-CloudNet'17/1524661117396.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/Track-CloudNet'17/1524663396210.png b/Paper Reading Note/Track-CloudNet'17/1524663396210.png
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/Track-CloudNet'17/Track.md b/Paper Reading Note/Track-CloudNet'17/Track.md
old mode 100644
new mode 100755
diff --git a/Paper Reading Note/desktop.ini b/Paper Reading Note/desktop.ini
old mode 100644
new mode 100755
diff --git a/README.md b/README.md
old mode 100644
new mode 100755
index 0fcb259..181ec6a
--- a/README.md
+++ b/README.md
@@ -39,18 +39,20 @@ In this repo, it records some paper related to storage system, including **Data
2. *dedupv1: Improving Deduplication Throughput using Solid State Drives (SSD)*----MSST'10 ([link](https://ieeexplore.ieee.org/document/5496992)) [summary](https://yzr95924.github.io/paper_summary/dedupv1-MSST'10.html)
3. *Extreme Binning: Scalable, Parallel Deduplication for Chunk-based File Backup*----MASCOTS'09 ([link](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.467.1985&rep=rep1&type=pdf)) [summary](https://yzr95924.github.io/paper_summary/ExtremeBining-MASCOTS'09.html)
4. *Sparse Indexing: Large Scale, Inline Deduplication Using Sampling and Locality*----FAST'09 ([link](https://pdfs.semanticscholar.org/6585/e111960d2b170bb6677865b73b6d1f27d71a.pdf)) [summary](yzr95924.github.io/paper_summary/SparseIndex-FAST'09.html)
-5. *Building a High-performance Deduplication System*----USENIX ATC'11 ([link]( https://www.usenix.org/legacy/event/atc11/tech/final_files/GuoEfstathopoulos.pdf )) [summary]( https://yzr95924.github.io/paper_summary/Dedup-ATC'11.html )
-6. *Primary Data Deduplication - Large Scale Study and System Design*----USENIX ATC'12 ([link]( https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf ))
-7. *Storage Efficiency Opportunities and Analysis for Video Repositories*----HotStorage'15
-8. *Venti: A New Approach to Archival Storage*----FAST'02 ([link](https://www.usenix.org/legacy/publications/library/proceedings/fast02/quinlan/quinlan.pdf))
-9. *ChunkStash: Speeding up Inline Storage Deduplication using Flash Memory*----USENIX ATC'10 ([link](https://www.usenix.org/legacy/events/atc10/tech/full_papers/Debnath.pdf))
-10. *Data Domain Cloud Tier: Backup here, Backup there, Deduplicated Everywhere!*----USENIX ATC'19 ([link](https://www.usenix.org/system/files/atc19-duggal.pdf)) [summary]( https://yzr95924.github.io/paper_summary/CloudTier-ATC'19.html )
-11. *SmartDedup: Optimizing Deduplication for Resource-constrained Devices*----USENIX ATC'19 ([link](https://www.usenix.org/system/files/atc19-yang-qirui.pdf))
+5. *SiLo: A Similarity-Locality based Near-Exact Deduplication Scheme with Low RAM Overhead and High Throughput*----USENIX ATC'11 ([link](https://www.usenix.org/legacy/event/atc11/tech/final_files/Xia.pdf))
+6. *Building a High-performance Deduplication System*----USENIX ATC'11 ([link]( https://www.usenix.org/legacy/event/atc11/tech/final_files/GuoEfstathopoulos.pdf )) [summary]( https://yzr95924.github.io/paper_summary/Dedup-ATC'11.html )
+7. *Primary Data Deduplication - Large Scale Study and System Design*----USENIX ATC'12 ([link]( https://www.usenix.org/system/files/conference/atc12/atc12-final293.pdf ))
+8. *Storage Efficiency Opportunities and Analysis for Video Repositories*----HotStorage'15
+9. *Venti: A New Approach to Archival Storage*----FAST'02 ([link](https://www.usenix.org/legacy/publications/library/proceedings/fast02/quinlan/quinlan.pdf))
+10. *ChunkStash: Speeding up Inline Storage Deduplication using Flash Memory*----USENIX ATC'10 ([link](https://www.usenix.org/legacy/events/atc10/tech/full_papers/Debnath.pdf))
+11. *Data Domain Cloud Tier: Backup here, Backup there, Deduplicated Everywhere!*----USENIX ATC'19 ([link](https://www.usenix.org/system/files/atc19-duggal.pdf)) [summary]( https://yzr95924.github.io/paper_summary/CloudTier-ATC'19.html )
+12. *SmartDedup: Optimizing Deduplication for Resource-constrained Devices*----USENIX ATC'19 ([link](https://www.usenix.org/system/files/atc19-yang-qirui.pdf))
13. Can't We All Get Along? Redesigning Protection Storage for Modern Workloads----USENIX ATC'18 ([link](https://www.usenix.org/system/files/conference/atc18/atc18-allu.pdf)) [summary](https://yzr95924.github.io/paper_summary/Redesigning-ATC'18.html)
14. *Deduplication in SSDs: Model and quantitative analysis*----MSST'12 ([link](https://ieeexplore.ieee.org/document/6232379))
-14. *Cumulus: Filesystem Backup to the Cloud*----FAST'09 ([link](https://www.usenix.org/legacy/event/fast09/tech/full_papers/vrable/vrable.pdf))
-15. *iDedup: Latency-aware, Inline Data Deduplication for Primary Storage*----FAST'12 ([link]( https://www.usenix.org/legacy/event/fast12/tech/full_papers/Srinivasan.pdf ))
-16. *DupHunter: Flexible High-Performance Deduplication for Docker Registries*----USENIX ATC'20 ([link](https://www.usenix.org/system/files/atc20-zhao.pdf))
+15. *Cumulus: Filesystem Backup to the Cloud*----FAST'09 ([link](https://www.usenix.org/legacy/event/fast09/tech/full_papers/vrable/vrable.pdf))
+16. *iDedup: Latency-aware, Inline Data Deduplication for Primary Storage*----FAST'12 ([link]( https://www.usenix.org/legacy/event/fast12/tech/full_papers/Srinivasan.pdf ))
+17. *DupHunter: Flexible High-Performance Deduplication for Docker Registries*----USENIX ATC'20 ([link](https://www.usenix.org/system/files/atc20-zhao.pdf))
+18. *Design Tradeoffs for Data Deduplication Performance in Backup Workloads*----FAST'15 ([link](https://www.usenix.org/system/files/conference/fast15/fast15-paper-fu.pdf)) [summary](https://yzr95924.github.io/paper_summary/DedupDesignTradeoff-FAST'15.html)
### Restore Performances
@@ -62,6 +64,7 @@ In this repo, it records some paper related to storage system, including **Data
6. *Sliding Look-Back Window Assisted Data Chunk Rewriting for Improving Deduplication Restore Performance*----FAST'19 ([link](https://www.usenix.org/system/files/fast19-cao.pdf)) [summary](https://yzr95924.github.io/paper_summary/LookBackWindow-FAST'19.html)
7. *Improving Restore Speed for Backup Systems that Use Inline Chunk-Based Deduplication*---FAST'13 ([link](https://www.usenix.org/system/files/conference/fast13/fast13-final124.pdf)) [summary](https://yzr95924.github.io/paper_summary/ImproveRestore-FAST'13.html)
8. *Chunk Fragmentation Level: An Effective Indicator for Read Performance Degradation in Deduplication Storage*----HPCC'11
+9. *Improving the Restore Performance via Physical Locality Middleware for Backup Systems*----Middleware'20 ([link](https://dl.acm.org/doi/pdf/10.1145/3423211.3425691))
### Secure Deduplication
1. *Convergent Dispersal: Toward Storage-Efficient Security in a Cloud-of-Clouds*----HotStorage'14 ([link](https://www.cse.cuhk.edu.hk/~pclee/www/pubs/hotstorage14.pdf)) [summary](https://yzr95924.github.io/paper_summary/CAONT-RS-HotStorage'14.html)
@@ -90,10 +93,10 @@ In this repo, it records some paper related to storage system, including **Data
24. *PraDa: Privacy-preserving Data Deduplication as a Service*----CIKM'14 ([link](https://msuweb.montclair.edu/~dongb/publications/cikm2014.pdf))
25. *Privacy-Preserving Data Deduplication on Trusted Processors*----CLOUD'17 ([link](https://ieeexplore.ieee.org/document/8030573)) [summary]( https://yzr95924.github.io/paper_summary/PrivacyPreservingDedup-CLOUD'17.html )
26. *Distributed Key Generation for Encrypted Deduplication: Achieving the Strongest Privacy*----CCSW'14 ([link]( https://dl.acm.org/doi/abs/10.1145/2664168.2664169 )) [summary](https://yzr95924.github.io/paper_summary/DistributedKeyGen-CCSW'14.html)
+27. *Proofs of Ownership on Encrypted Cloud Data via Intel SGX*----ACNS'20 ([link](https://link.springer.com/chapter/10.1007/978-3-030-61638-0_22)) [summary](https://yzr95924.github.io/paper_summary/PoWSGX-ACNS'20.html)
### Computation Deduplication
-1. *SPEED: Accelerating Enclave Applications via Secure Deduplication*----ICDCS'19 ([link](https://conferences.computer.org/icdcs/2019/pdfs/ICDCS2019-49XpIlu3rRtYi2T0qVYnNX/5DGHpUvuZKbyIr6VRJc0zW/5PfoKBVnBKUPCcy8ruoayx.pdf))
2. *Secure Deduplication of General Computations*
@@ -267,18 +270,27 @@ In this repo, it records some paper related to storage system, including **Data
2. *Calibrating Noise to Sensitivity in Private Data Analysis*----TCC'06 ([link](https://www.microsoft.com/en-us/research/wp-content/uploads/2006/03/dmns06.pdf))
3. Privacy at Scale: Local Differential Privacy in Practice----SIGMOD'18 ([link](http://dimacs.rutgers.edu/~graham/pubs/papers/ldptutorial.pdf))
-### SGX
-
+### SGX Technique
+1. *Graphene-SGX: A Practical Library OS for Unmodified Applications on SGX*----USENIX ATC'17 ([link](https://www.usenix.org/system/files/conference/atc17/atc17-tsai.pdf))
+2. *Intel SGX Explained*----IACR'16 ([link]( https://eprint.iacr.org/2016/086.pdf ))
+3. *OpenSGX: An Open Platform for SGX Research*----NDSS'16 ([link](http://ina.kaist.ac.kr/~dongsuh/paper/opensgx.pdf))
+4. *SCONE: Secure Linux Containers with Intel SGX*----OSDI'16 ([link](https://www.usenix.org/system/files/conference/osdi16/osdi16-arnautov.pdf))
+5. *Varys: Protecting SGX Enclaves From Practical Side-Channel Attacks*----USENIX ATC'18 ([link](https://www.usenix.org/system/files/conference/atc18/atc18-oleksenko.pdf))
+6. *sgx-perf: A Performance Analysis Tool for Intel SGX Enclaves*----Middleware'18 ([link](https://www.ibr.cs.tu-bs.de/users/weichbr/papers/middleware2018.pdf)) [summary]( https://yzr95924.github.io/paper_summary/SGXPerf-Middleware'18.html )
+7. *TaLoS: Secure and Transparent TLS Termination inside SGX Enclaves*----arxiv'17 ([link](https://www.doc.ic.ac.uk/~fkelbert/papers/talos17.pdf)) [summary](https://yzr95924.github.io/paper_summary/talos-arxiv'17.html)
+8. *Switchless Calls Made Practical in Intel SGX*----SysTex'18 ([link](https://dl.acm.org/doi/pdf/10.1145/3268935.3268942))
+9. *Regaining Lost Seconds: Efficient Page Preloading for SGX Enclaves*----Middleware'20 ([link](https://dl.acm.org/doi/pdf/10.1145/3423211.3425673))
+
+### SGX Storage
1. *NEXUS: Practical and Secure Access Control on Untrusted Storage Platforms using Client-side SGX*----DSN'19 ([link](https://people.cs.pitt.edu/~adamlee/pubs/2019/djoko2019dsn-nexus.pdf))
-2. *Securing the Storage Data Path with SGX Enclaves*----arxiv'18 ([link](https://arxiv.org/abs/1806.10883))
+2. *Securing the Storage Data Path with SGX Enclaves*----arxiv'18 ([link](https://arxiv.org/abs/1806.10883)) [summary](https://yzr95924.github.io/paper_summary/StorageDataPathSGX-arxiv.html)
3. *EnclaveDB: A Secure Database using SGX*----S&P'18
4. *Isolating Operating System Components with Intel SGX*----SysTEX'16 ([link](https://faui1-files.cs.fau.de/filepool/projects/sgx-kernel/sgx-kernel.pdf))
-5. *SPEICHER: Securing LSM-based Key-Value Stores using Shielded Execution*----FAST'19 ([link](https://www.usenix.org/system/files/fast19-bailleu.pdf))
+5. *SPEICHER: Securing LSM-based Key-Value Stores using Shielded Execution*----FAST'19 ([link](https://www.usenix.org/system/files/fast19-bailleu.pdf)) [summary](https://yzr95924.github.io/paper_summary/SPEICHER-FAST'19.html)
6. *ShieldStore: Shielded In-memory Key-Value Storage with SGX*----EUROSYS'19 ([link]( http://calab.kaist.ac.kr:8080/~jhuh/papers/kim_eurosys19_shieldst.pdf )) [summary](https://yzr95924.github.io/paper_summary/ShieldStore-EuroSys'19.html)
7. SeGShare: Secure Group File Sharing in the Cloud using Enclaves----DSN'20 ([link](http://www.fkerschbaum.org/dsn20.pdf)) [summary](https://yzr95924.github.io/paper_summary/SeGShare-DSN'20.html)
-8. *Graphene-SGX: A Practical Library OS for Unmodified Applications on SGX*----USENIX ATC'17 ([link](https://www.usenix.org/system/files/conference/atc17/atc17-tsai.pdf))
-9. *Intel SGX Explained*----IACR'16 ([link]( https://eprint.iacr.org/2016/086.pdf ))
-10. *OpenSGX: An Open Platform for SGX Research*----NDSS'16 ([link](http://ina.kaist.ac.kr/~dongsuh/paper/opensgx.pdf))
+8. *DISKSHIELD: A Data Tamper-Resistant Storage for Intel SGX*----AsiaCCS'20 ([link](https://dl.acm.org/doi/pdf/10.1145/3320269.3384717))
+9. *SPEED: Accelerating Enclave Applications via Secure Deduplication*----ICDCS'19 ([link](https://conferences.computer.org/icdcs/2019/pdfs/ICDCS2019-49XpIlu3rRtYi2T0qVYnNX/5DGHpUvuZKbyIr6VRJc0zW/5PfoKBVnBKUPCcy8ruoayx.pdf)) [summary](https://yzr95924.github.io/paper_summary/SPEED-ICDCS'19.html)
### Network Security
diff --git a/StoragePaperNote/ChunkingAnalysisFramework-HP-TR'05.md b/StoragePaperNote/ChunkingAnalysisFramework-HP-TR'05.md
old mode 100644
new mode 100755
index ac1eb69..05a3b89
--- a/StoragePaperNote/ChunkingAnalysisFramework-HP-TR'05.md
+++ b/StoragePaperNote/ChunkingAnalysisFramework-HP-TR'05.md
@@ -1,59 +1,59 @@
----
-typora-copy-images-to: ../paper_figure
----
-A Framework for Analyzing and Improving Content-Based Chunking Algorithms
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| HP-TR'05 | Deduplication Chunking |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper proposes a framework to analyze the content-based chunking algorithms. And use this framework for the evaluation of the basic sliding window algorithm, and its two known variants.
-> focus on **stateless chunking algorithm**, do not consider the history of the sequence, or the state of a server where other versions of the sequence might be stored.
-
-**Chunking stability**: if it makes a small modification to data, turning into a new version, and apply the chunking algorithm to the new version of data
-
-> most of the chunk created for the new version are identical to the chunks created for the older version data.
-
-### Tow Thresholds, Two Divisors Algorithm (TTTD)
-
-- Analysis on Basic Sliding Window Algorithm (BSW)
-Basic workflow of the BSW:
-there is a pre-determined integer $D$, a fixed width sliding windows is moved across the file, and at every position in the file.
-> the content of this window are fingerprinted.
-> Highly efficient fingerprint algorithm: e.g., Rabins fingerprint is used for this purpose.
-> A position $k$ is declared to be a chunk boundary if there is a $D-match$ at $k$.
-
-- The concept of fingerprint match
-$S = s_1, s_2, ..., s_n$, a fingerprint function $h$ and a window length $l$, there is a $D-match$ at $k$ if, for some pre-determined $r \leq D$,
-
-$$
-h(W) \mod D = r, \text{where} (W = s_{k-l+1}, s_{k-l+2},....,s_k)
-$$
-
-
-
-> the property guarantees that BSW is stable under local modifications.
-
-- The analytic framework
-It proposes a quantitiative framework for analyzing and comparing different chunking algorithms.
-> **modification overhead**: the number of bytes that need to be transferred not because they are new data, but because chunk boundaries.
-> **modification index**: the normalization of the modification overhead by dividing it with mean chunk size.
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. This paper provides an analytic framework for evaluating and comparing chunking algorithms
-> analyzing the basic sliding window chunking algorithm and two of its existing variants.
-
-2. This paper also proposes a new chunking algorithm (TTTD) which performs much better than the existing algorithms.
-> do some experiments to validate the superiority of TTTD based on a large collection of real files.
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: ../paper_figure
+---
+A Framework for Analyzing and Improving Content-Based Chunking Algorithms
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| HP-TR'05 | Deduplication Chunking |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper proposes a framework for analyzing content-based chunking algorithms and uses it to evaluate the basic sliding window algorithm and two of its known variants.
+> It focuses on **stateless chunking algorithms**, which consider neither the history of the sequence nor the state of a server where other versions of the sequence might be stored.
+
+**Chunking stability**: if a small modification turns the data into a new version and the chunking algorithm is applied to that new version,
+
+> most of the chunks created for the new version are identical to the chunks created for the older version.
+
+### Two Thresholds, Two Divisors Algorithm (TTTD)
+
+- Analysis on the Basic Sliding Window Algorithm (BSW)
+Basic workflow of BSW:
+given a pre-determined integer $D$, a fixed-width sliding window is moved across the file, and at every position in the file:
+> the content of this window is fingerprinted.
+> A highly efficient fingerprinting algorithm, e.g., Rabin's fingerprint, is used for this purpose.
+> A position $k$ is declared to be a chunk boundary if there is a $D$-match at $k$.
+
+- The concept of fingerprint match
+Given a sequence $S = s_1, s_2, ..., s_n$, a fingerprint function $h$, and a window length $l$, there is a $D$-match at $k$ if, for some pre-determined $r \leq D$,
+
+$$
+h(W) \bmod D = r, \quad \text{where } W = (s_{k-l+1}, s_{k-l+2}, \ldots, s_k)
+$$
+
+
+
+> the property guarantees that BSW is stable under local modifications.
+
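+A minimal sketch of a BSW-style chunker, assuming a toy polynomial rolling hash in place of Rabin's fingerprint; the window size, $D$, and $r$ values below are illustrative, not from the paper:
+
+```python
+def bsw_boundaries(data: bytes, window: int = 48, D: int = 8192, r: int = 0):
+    """Return positions k that are D-matches, i.e., h(window ending at k) mod D == r."""
+    BASE, MOD = 257, (1 << 61) - 1
+    pow_w = pow(BASE, window - 1, MOD)   # weight of the byte leaving the window
+    fp, boundaries = 0, []
+    for k, byte in enumerate(data):
+        if k >= window:                  # slide: drop the oldest byte's contribution
+            fp = (fp - data[k - window] * pow_w) % MOD
+        fp = (fp * BASE + byte) % MOD    # add the newest byte
+        if k >= window - 1 and fp % D == r:
+            boundaries.append(k)         # D-match at position k => chunk boundary
+    return boundaries
+```
+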
+- The analytic framework
+It proposes a quantitative framework for analyzing and comparing different chunking algorithms.
+> **modification overhead**: the number of bytes that need to be transferred not because they are new data, but because chunk boundaries have shifted.
+> **modification index**: the modification overhead normalized by the mean chunk size.
+
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. This paper provides an analytic framework for evaluating and comparing chunking algorithms
+> analyzing the basic sliding window chunking algorithm and two of its existing variants.
+
+2. This paper also proposes a new chunking algorithm (TTTD) which performs much better than the existing algorithms.
+> Experiments on a large collection of real files validate the superiority of TTTD.
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. This paper notes that it is important for chunking algorithms to take the variation in chunk sizes into account.
\ No newline at end of file
diff --git a/StoragePaperNote/DedupDesignTradeoff-FAST'15.md b/StoragePaperNote/DedupDesignTradeoff-FAST'15.md
deleted file mode 100644
index d691ba7..0000000
--- a/StoragePaperNote/DedupDesignTradeoff-FAST'15.md
+++ /dev/null
@@ -1,26 +0,0 @@
----
-typora-copy-images-to: paper_figure
----
-Design Tradeoffs for Data Deduplication Performance in Backup Workloads
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'15 | Data Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-There have been many publications about data deduplication. However, it remains unclear how existing solutions make their design decisions and whether potential solutions can do better.
-
-Although, there are some open-source deduplication platforms, none of them are capable of evaluating the parameter space of this paper. Thus, it also presents a general-purpose deduplication framework.
-> for comprehersive data deduplication evaluation.
-
-### Method Name
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
diff --git a/StoragePaperNote/Deduplication/Cache-Dedup/LIPA-MSST'19.md b/StoragePaperNote/Deduplication/Cache-Dedup/LIPA-MSST'19.md
old mode 100644
new mode 100755
index fe8c7f3..2d788f5
--- a/StoragePaperNote/Deduplication/Cache-Dedup/LIPA-MSST'19.md
+++ b/StoragePaperNote/Deduplication/Cache-Dedup/LIPA-MSST'19.md
@@ -1,99 +1,99 @@
----
-typora-copy-images-to: ../paper_figure
----
-LIPA: A Learning-based Indexing and Prefetching Approach for Data Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| MSST'19 | Deduplication Index |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper uses the reinforcement learning framework to build an adaptive indexing structure.
-> to solve the chunk-lookup disk bottleneck problem for large-scale
-
-Current methods:
->1. a full chunk
->2. a sampled chunk index
-> drawback: hard to fit in RAM and sampled chunk index directly affects the deduplication ratio dependent on the sampling ratio.
-
-Problem: how to build an efficient fingerprint indexing to help identify duplicate data chunks?
-> chunk-lookup disk bottleneck problem
-
-### Learning-based Indexing Prefetching Approach (LIPA)
-Goal: propose a simple reinforcement learning method to learn how to prefetch a segment dynamically.
-> a trial-and-error prefectching and then gives a delayed reward during the data stream evolution.
-
-- Main idea
-Explore locality in context of data streams:
-> An incoming segment may share the same feature with previous neighboring segments
-
-This paper trains the **locality relationship** and deduplicate each incoming segment against only a few of its previous segments.
-
-By using reinforcement learning mythology,
-> it aims to identify the most similar segment (*champion*) for an incoming segment (**temporal locality**)
-> then prefetch several successive segments (*followers*) by exploiting **spatial locality**.
-
-- Segment similarity
- say two segments are similar if they share a number of the same chunks.
-- Reinforcement learning
-
-In this paper, it uses **K-armed contextual bandit**, the goal of the agent is to obtain rewards as much as possible in the long term.
-
-- The whole workflow
-
-the deduplication relies on the interaction between of the context table and the fingerprint table (**fingerprint cache**)
-
-
-- Feature sampling
-sample no larger than $4$ for a segment sized of $1024$, sample rate: $256:1$.
-
-- Champion choosing
-The context table usually keeps a score for each segment which reflects its contribution to deduplication in the past time.
-> choose a segment with the higher score
-
-- Reward feedback
-During the period of a champion in cache, it adds up all lookup hits until it is evicted from cache and then feedback the reward and update the corresponding score in the context table.
-> a reward value: the count of the lookup hits of the segment.
-> once update or incremental update
-
-### Implementation and Evaluation
-- Implementation
-Based on Destor: each phase corresponds to a thread.
-> chunking, hashing, indexing, storing
-
-- Evaluation
-1. Dataset
-> 1. Linux Kernel archival: 155 versions
-> 2. Vmdk: pre-made VM disk images for VMware's Virtual Appliance Market place.
-> 3. Fslhome: 9 users in 14 days from Sep. 16th to 30th in 2011
-> 4. FSL MacOS
-
-2. Deduplication ratio
-
-3. RAM usage
-memory overhead for the fingerprint index lookup.
-4. Data throughput
-
-
-## 2. Strength (Contributions of the paper)
-1. the main contribution of this work is that it proposes a new deduplication framework based on reinforcement learning for data deduplication.
-
-## 3. Weakness (Limitations of the paper)
-1. the rationale behind this work is still vague, it should explains
-> segment with higher score $\rightarrow$ higher lookup hits $\rightarrow$ should be cached?
-> it just shows this via experiments instead of theoretical analysis
-
-## 4. Future Works
-1. This work mentions the point that here lacks an adaptive feedback mechanism to adjust the mapping relationship to reflect the dynamics of incoming data streams.
-> refer from MSST'16: even similar users behave quite differently, which should be accounted for in future deduplication systems.
-
-2. The key reinforcement learning algorithm is based on K-armed bandit.
-> a very general learning framework for prefetching
-
-3. This paper mentions that it is important to find a better balance between
-> exploitation by selecting a segment known to be useful.
-> exploration by selecting a segment whose usefulness is unknown but which might provide a bigger reward and thus expand the known space.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+LIPA: A Learning-based Indexing and Prefetching Approach for Data Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| MSST'19 | Deduplication Index |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper uses the reinforcement learning framework to build an adaptive indexing structure.
+> to solve the chunk-lookup disk bottleneck problem in large-scale deduplication
+
+Current methods:
+>1. a full chunk index
+>2. a sampled chunk index
+> drawback: a full index is hard to fit in RAM, and a sampled index makes the deduplication ratio directly dependent on the sampling ratio.
+
+Problem: how to build an efficient fingerprint indexing to help identify duplicate data chunks?
+> chunk-lookup disk bottleneck problem
+
+### Learning-based Indexing Prefetching Approach (LIPA)
+Goal: propose a simple reinforcement learning method to learn how to prefetch a segment dynamically.
+> trial-and-error prefetching that receives a delayed reward as the data stream evolves.
+
+- Main idea
+Explore locality in context of data streams:
+> An incoming segment may share the same feature with previous neighboring segments
+
+This paper learns the **locality relationship** and deduplicates each incoming segment against only a few of its previous segments.
+
+By using reinforcement learning methodology,
+> it aims to identify the most similar segment (*champion*) for an incoming segment (**temporal locality**)
+> then prefetch several successive segments (*followers*) by exploiting **spatial locality**.
+
+- Segment similarity
+ Two segments are said to be similar if they share a number of the same chunks.
+- Reinforcement learning
+
+This paper models the problem as a **K-armed contextual bandit**, where the goal of the agent is to obtain as much reward as possible in the long term.
+
+- The whole workflow
+
+Deduplication relies on the interaction between the context table and the fingerprint table (**fingerprint cache**).
+
+
+- Feature sampling
+sample at most $4$ features for a segment of size $1024$, i.e., a sampling rate of $256:1$.
+
+- Champion choosing
+The context table keeps a score for each segment that reflects its past contribution to deduplication.
+> choose the segment with the highest score
+
+- Reward feedback
+While a champion is in the cache, LIPA accumulates all of its lookup hits; when the champion is evicted, the reward is fed back and the corresponding score in the context table is updated.
+> reward value: the count of lookup hits of the segment
+> the score is updated either all at once or incrementally
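+
+A minimal sketch of the score-based champion selection and reward feedback described above; the class and method names, and the incremental update rule, are illustrative assumptions rather than details from the paper:
+
+```python
+class ContextTable:
+    """Maps a segment feature to candidate segments and their scores."""
+    def __init__(self, alpha: float = 0.5):
+        self.scores = {}      # feature -> {segment_id: score}
+        self.alpha = alpha    # assumed learning rate for the incremental update
+
+    def choose_champion(self, feature):
+        # pick the candidate segment with the highest score for this feature
+        candidates = self.scores.get(feature, {})
+        return max(candidates, key=candidates.get) if candidates else None
+
+    def feedback(self, feature, segment_id, lookup_hits):
+        # delayed reward: lookup hits accumulated while the champion was cached
+        seg_scores = self.scores.setdefault(feature, {})
+        old = seg_scores.get(segment_id, 0.0)
+        # incremental update (assumption): move the score toward the observed reward
+        seg_scores[segment_id] = old + self.alpha * (lookup_hits - old)
+```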
+
+### Implementation and Evaluation
+- Implementation
+Based on Destor: each phase corresponds to a thread.
+> chunking, hashing, indexing, storing
+
+- Evaluation
+1. Dataset
+> 1. Linux kernel archive: 155 versions
+> 2. Vmdk: pre-made VM disk images from VMware's Virtual Appliance Marketplace.
+> 3. Fslhome: 9 users in 14 days from Sep. 16th to 30th in 2011
+> 4. FSL MacOS
+
+2. Deduplication ratio
+
+3. RAM usage
+memory overhead for the fingerprint index lookup.
+4. Data throughput
+
+
+## 2. Strength (Contributions of the paper)
+1. The main contribution of this work is a new indexing and prefetching framework for data deduplication based on reinforcement learning.
+
+## 3. Weakness (Limitations of the paper)
+1. The rationale behind this work is still vague; it should explain why
+> segment with a higher score $\rightarrow$ higher lookup hits $\rightarrow$ should be cached?
+> this is shown only via experiments rather than theoretical analysis
+
+## 4. Future Works
+1. This work points out that existing approaches lack an adaptive feedback mechanism to adjust the mapping relationship to reflect the dynamics of incoming data streams.
+> refer from MSST'16: even similar users behave quite differently, which should be accounted for in future deduplication systems.
+
+2. The key reinforcement learning algorithm is based on K-armed bandit.
+> a very general learning framework for prefetching
+
+3. This paper mentions that it is important to find a better balance between
+> exploitation by selecting a segment known to be useful.
+> exploration by selecting a segment whose usefulness is unknown but which might provide a bigger reward and thus expand the known space.
+
+4. This work mentions that LIPA can be strengthened to capture regular and irregular patterns by using upper-level access hints and file attributes.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Chunking/FastCDC-ATC'16.md b/StoragePaperNote/Deduplication/Chunking/FastCDC-ATC'16.md
old mode 100644
new mode 100755
index 9c341af..68b3c86
--- a/StoragePaperNote/Deduplication/Chunking/FastCDC-ATC'16.md
+++ b/StoragePaperNote/Deduplication/Chunking/FastCDC-ATC'16.md
@@ -1,103 +1,103 @@
----
-typora-copy-images-to: ../paper_figure
----
-FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ATC'16 | chunking |
-[TOC]
-
-## 1. Summary
-
-### Motivation of this paper
-- Motivation
- - Existing CDC-based chunking introduces heavy CPU overhead
- - declare the chunk cut-points by computing and judging the rolling hashes of the data stream **byte-by-byte**.
- - Two parts: hashing and judging the cutpoints
- - By using Gear function, the bottleneck has shifted to the hash judgment stage.
-
-### FastCDC
-
-- Three key designs
- - Simplified but enhanced hash judgment
- - padding several zero bits into the mask
- - Sub-minimum chunk cut-point skipping
- - enlarge the minimum chunk size to maximize the chunking speed
- - Normalized chunking
- - normalizes the chunk size distribution to a small specified region
- - increase the deduplication ratio
- - reduce the number of small-sized chunks (can combine with the cut-point skipping technique above to maximize the CDC speed while without sacrificing the deduplication ratio.)
-
-- Gear hash function
- - an array of 256 random 64-bit integers to map the values of the byte contents in the sliding window.
- - using only three operations (i.e., +, <<, and an array lookup)
- - enabling it to move quickly through the data content for the purpose of CDC.
-
-
-
-
-- Optimizing hash judgement
- - Gear-based CDC employs the same conventional hash judgment used in the Rabin-based CDC
- - A certain number of the lowest bits of the fingerprint are used to declare the chunk cut-point.
- - FastCDC enlarges the sliding window size by padding a number of zero bits into the mask value
- - change the hash judgment statement
- - involve more bytes in the final hash judgment
- - minimizing the probability of chunking position collision
- - simplifying the hash judgment to accelerate CDC
- - in Rabin: fp mod D == r
- - in FastCDC: fp & Mask == 0 --> !fp & Mask
- - avoid the unnecessary comparison operation
-
-- Cut-point skipping
- - avoid the operations for hash calculation and judgment in the skipped region.
- - may reduce the deduplication ratio.
- - the cumulative distribution of chunk size in Rabin-based CDC (without the maximum and minimum chunk size requirements) follows **an exponential distribution**.
-
-- Normalized chunking
- - solve the problem of decreased deduplication ratio facing the cut-point skipping approach.
- - After normalized chunking, there are almost no chunks of size smaller than the minimum chunk size
- - By changing the number of '1' bits in FastCDC, the chunk-size distribution will be approximately normalized to a specific region (always larger than the minimum chunk size, instead of following the exponential distribution)
- - define two masks
- - more effective mask bits: increase chunk size
- - fewer effective mask bits: reduce chunk size
-
-
-- The whole algorithm
-
-
-
-### Implementation and Evaluation
-
-- Evaluation standard
- - deduplication ratio
- - chunking speed
- - the average generated chunk size
-
-- Compared with
- - FastCDC
- - Gear-based
- - AE-based
- - Rabin-based
-
-- Evaluation of optimizing hash judgement
-- Evaluation of cut-point skipping
-- Evaluation of normalized chunking
-- Comprehensive evaluation of FastCDC
-
-## 2. Strength (Contributions of the paper)
-1. propose a new chunking algorithm with three new designs
-> enhanced hash judgment
-> sub-minimum chunk cut-point skipping
-> normalized chunking
-
-
-## 3. Weakness (Limitations of the paper)
-
-1. the part of cut-point skipping is not clear
-
-## 4. Some Insights (Future work)
-
-1. The research direction in chunking algorithm
-> algorithmic-oriented CDC optimizations
-> hardware-oriented CDC optimizations
+---
+typora-copy-images-to: ../paper_figure
+---
+FastCDC: a Fast and Efficient Content-Defined Chunking Approach for Data Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ATC'16 | chunking |
+[TOC]
+
+## 1. Summary
+
+### Motivation of this paper
+- Motivation
+ - Existing CDC-based chunking introduces heavy CPU overhead
+ - declare the chunk cut-points by computing and judging the rolling hashes of the data stream **byte-by-byte**.
+ - Two parts: hashing and judging the cutpoints
+ - By using Gear function, the bottleneck has shifted to the hash judgment stage.
+
+### FastCDC
+
+- Three key designs
+ - Simplified but enhanced hash judgment
+ - padding several zero bits into the mask
+ - Sub-minimum chunk cut-point skipping
+ - enlarge the minimum chunk size to maximize the chunking speed
+ - Normalized chunking
+ - normalizes the chunk size distribution to a small specified region
+ - increase the deduplication ratio
+    - reduce the number of small-sized chunks (can be combined with the cut-point skipping technique above to maximize the CDC speed without sacrificing the deduplication ratio)
+
+- Gear hash function
+ - an array of 256 random 64-bit integers to map the values of the byte contents in the sliding window.
+ - using only three operations (i.e., +, <<, and an array lookup)
+ - enabling it to move quickly through the data content for the purpose of CDC.
+
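+A minimal sketch of the Gear rolling hash described above; the table seeding is illustrative, and only the shift/add/lookup structure matters:
+
+```python
+import random
+
+random.seed(0)
+GEAR = [random.getrandbits(64) for _ in range(256)]  # 256 random 64-bit integers
+MASK64 = (1 << 64) - 1
+
+def gear_roll(fp: int, byte: int) -> int:
+    # one shift, one add, one array lookup per input byte
+    return ((fp << 1) + GEAR[byte]) & MASK64
+```
+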
+
+
+
+- Optimizing hash judgement
+ - Gear-based CDC employs the same conventional hash judgment used in the Rabin-based CDC
+ - A certain number of the lowest bits of the fingerprint are used to declare the chunk cut-point.
+ - FastCDC enlarges the sliding window size by padding a number of zero bits into the mask value
+ - change the hash judgment statement
+ - involve more bytes in the final hash judgment
+ - minimizing the probability of chunking position collision
+ - simplifying the hash judgment to accelerate CDC
+    - in Rabin: `fp mod D == r`
+    - in FastCDC: `fp & Mask == 0`, simplified to `!(fp & Mask)`
+ - avoid the unnecessary comparison operation
+
+- Cut-point skipping
+ - avoid the operations for hash calculation and judgment in the skipped region.
+ - may reduce the deduplication ratio.
+ - the cumulative distribution of chunk size in Rabin-based CDC (without the maximum and minimum chunk size requirements) follows **an exponential distribution**.
+
+- Normalized chunking
+ - solve the problem of decreased deduplication ratio facing the cut-point skipping approach.
+ - After normalized chunking, there are almost no chunks of size smaller than the minimum chunk size
+ - By changing the number of '1' bits in FastCDC, the chunk-size distribution will be approximately normalized to a specific region (always larger than the minimum chunk size, instead of following the exponential distribution)
+ - define two masks
+ - more effective mask bits: increase chunk size
+ - fewer effective mask bits: reduce chunk size
+
+
+- The whole algorithm
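+
+A minimal sketch of how the pieces above might fit together, reusing `gear_roll` from the earlier snippet; the mask constants, size parameters, and boundary handling are illustrative assumptions, not the paper's reference implementation:
+
+```python
+def fastcdc_cut(data: bytes, min_size=2048, normal_size=8192, max_size=65536) -> int:
+    """Return the offset of the next chunk cut-point in data."""
+    # example masks: MASK_S has more '1' bits (harder to match -> larger chunks),
+    # MASK_L has fewer '1' bits (easier to match -> smaller chunks)
+    MASK_S = 0x0000D9F003530000
+    MASK_L = 0x0000D90003530000
+    n = len(data)
+    if n <= min_size:
+        return n
+    fp, i = 0, min_size              # cut-point skipping: skip the sub-minimum region
+    while i < min(normal_size, n):
+        fp = gear_roll(fp, data[i])
+        if not (fp & MASK_S):        # simplified judgment: fp & Mask == 0
+            return i + 1
+        i += 1
+    while i < min(max_size, n):
+        fp = gear_roll(fp, data[i])
+        if not (fp & MASK_L):
+            return i + 1
+        i += 1
+    return i                         # forced cut at max_size (or end of data)
+```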
+
+
+
+### Implementation and Evaluation
+
+- Evaluation standard
+ - deduplication ratio
+ - chunking speed
+ - the average generated chunk size
+
+- Compared with
+ - FastCDC
+ - Gear-based
+ - AE-based
+ - Rabin-based
+
+- Evaluation of optimizing hash judgement
+- Evaluation of cut-point skipping
+- Evaluation of normalized chunking
+- Comprehensive evaluation of FastCDC
+
+## 2. Strength (Contributions of the paper)
+1. propose a new chunking algorithm with three new designs
+> enhanced hash judgment
+> sub-minimum chunk cut-point skipping
+> normalized chunking
+
+
+## 3. Weakness (Limitations of the paper)
+
+1. the part of cut-point skipping is not clear
+
+## 4. Some Insights (Future work)
+
+1. The research direction in chunking algorithm
+> algorithmic-oriented CDC optimizations
+> hardware-oriented CDC optimizations
diff --git a/StoragePaperNote/Deduplication/Chunking/FrequencyBasedChunking-MASCOTS'10.md b/StoragePaperNote/Deduplication/Chunking/FrequencyBasedChunking-MASCOTS'10.md
old mode 100644
new mode 100755
index 15e2ef9..34809f8
--- a/StoragePaperNote/Deduplication/Chunking/FrequencyBasedChunking-MASCOTS'10.md
+++ b/StoragePaperNote/Deduplication/Chunking/FrequencyBasedChunking-MASCOTS'10.md
@@ -1,122 +1,122 @@
----
-typora-copy-images-to: ../paper_figure
----
-Frequency Based Chunking for Data De-Duplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| MASCOTS'10 | Deduplication Chunking |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper proposes a frequency based chunking algorithm, which utilizes the chunk frequency information in the data stream to enhance the data deduplication gain.
-> Especially when the metadata overhead is taken into consideration.
-
-
-The popular baseline CDC algorithm employs a deterministic distinct sampling technique to determine the chunk boundaries based on the data content.
-
-**Drawback of CDC**:
-Although CDC is scalable and efficient, content defined chunking is essentially a random chunking algorithm.
-> does not guarantee the appeared frequency of the resultant chunks
-> may not be optimal for data dedup purpose.
-
-The only option for the CDC algorithm is to reduce the average chunk size
-> when the average chunk size is below a certain value, the gain in the reduction of redundant chunks is diminished by the increase of the metadata cost.
-
-
-### Frequency Based Chunking
-Main idea: after a coarse grained CDC algorithm, they then performs a second level (fine-grained) chunking by identifying the chunks with a frequency greater than a predefined threshold.
-> this method requires a strong assumption on the knowledge of the individual chunk frequency.
-> two steps: chunk frequency estimation + chunking
-
-- Data dedup gain
-Define the amount of duplicated chunks removed and the cost of the metadata. $D(s) = distinct\{c_1, c_2, ..., c_n\}$, $m = |D(s)|$
-$$
-gain_A(s) = folding_{factor}(s) - metadata(s)
-$$
-
-$$
-folding_{factor}(s) = \sum_{c_i \in distinct(s)} |c_i| (f(c_i) - 1)
-$$
-here, $f(c_i)$ is the number of times a unique chunk $c_i$ appears in the stream.
-
-
-$$
-metadata(s) = |index| + |chunklist|= (\frac{1}{8}mlog_2m + 20 n)
-$$
-$log_2m$ is the minimal length of a chunk pointer that points to each unique chunk. 20 is the 20-byte SHA-1 hash.
-
-
-
-- Frequency Estimation
-In this paper, it adopts the concept of parallel stream filter and designs a special parallel bloom filter to identify high-frequency chunks and to obtain their frequency estimates.
-
-**How to reduce the oveahead of estimation?**
-This algorithm may only require the knowledge of high-frequency chunk candidates.
-> One interesting observation is that a majority of the chunk candidates are of low frequencies.
-
-Based on this idea, this paper applies a filtering process to eliminate as many low-frequency chunks as possible.
-> reserve resource to obtain an accurate frequency estimate for the high-frequency chunks.
-
-
-1. Prefiltering
-require only one XOR operation for filtering decision per chunk candidate it observes.
-> has a parameter $r_0$ to control the sample rate
-> for example, $r_0 = \frac{1}{32}$, makes a chunk candidate pass if its Rabin's fingerprint value modulo 32 == 1
-
-2. Parallel filtering
-After prefiltering, it then starts to check the appearance of a given in all $V$ bloom filters.
-> If it can find this chunk in each of the bloom filters, then it lets this chunk pass through the parallel filter and start to count its frequency.
-> Otherwise, it records the chunk in one of the $V$ bloom filters randomly.
-
-Without considering the false positive caused by bloom filters, only chunk candidates with frequency greater than $V$ can possibly pass the parallel filtering process.
-
-Due to the efficiency of the bloom filter data structure, it only consumes a small amount of memory, in this way they can filter out majority of the low frequency chunk candidates with small amount of resource.
-
-It mentions that although there is a rich literature for detecting high frequency items in a data stream.
-> However, for this problem, the threshold for high frequency chunks is usually as low as 15.
-> The classical heavy hitter detection algorithms do not scale under such a low threshold.
-
-- Two-stage chunking
-Main idea: combining the coarse-grained CDC chunking with fine-grained frequency based chunking.
-
-
-
-### Implementation and Evaluation
-- Evaluation
-Datasets: three empirical datasets
-> two datasets (Linux and Nytimes): data streams exhibit high degree of redundancy
-> one dataset (Mailbox): dataset with low detectable redundancy
-
-Evaluation Criteria:
-> Duplicate Elimination Ratio
-> Average Chunk Size
-
-
-
-## 2. Strength (Contributions of the paper)
-1. the FBC algorithm persistently outperforms the CDC algorithm in terms of
-> achieving a better dedup gain
-> producing much less number of chunks
-
-2. This paper advances the notion of data dudep gain to compare different chunking algorithms in a realistic scenario. (Dedup saving and metadata overhead)
-
-3. propose a novel frequency based chunking algorithm
-> considers the frequency information of data segments during chunking process.
-
-4. design a statistical chunk frequency estimation algorithm.
-> with small memory footprint
-
-## 3. Weakness (Limitations of the paper)
-1. The writing of this paper is not very clear.
-
-2. For this approach, it needs an extra pass on the dataset to obtain frequency information
-> it needs to design a one-pass algorithm which conducts the frequency estimation and the chunking process simultaneously.
-> An intuitive idea: tracking top k frequency segments instead of tracking all segments with frequency greater than a certain threshold.
-
-## 4. Future Works
-1. In this paper, one thing I can use is the idea that using a filter to reduce the memory overhead in hashtable. (how about use the sketch as a filter?)
-
-2. This paper considers a fix-sized chunk to be frequent if its frequency no less than a pre-defined threshold $\delta$
+---
+typora-copy-images-to: ../paper_figure
+---
+Frequency Based Chunking for Data De-Duplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| MASCOTS'10 | Deduplication Chunking |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper proposes a frequency based chunking algorithm, which utilizes the chunk frequency information in the data stream to enhance the data deduplication gain.
+> Especially when the metadata overhead is taken into consideration.
+
+
+The popular baseline CDC algorithm employs a deterministic distinct sampling technique to determine the chunk boundaries based on the data content.
+
+**Drawback of CDC**:
+Although CDC is scalable and efficient, content defined chunking is essentially a random chunking algorithm.
+> it provides no guarantee on the frequencies of the resultant chunks
+> and so may not be optimal for data dedup purposes.
+
+The only option for the CDC algorithm is to reduce the average chunk size
+> when the average chunk size is below a certain value, the gain in the reduction of redundant chunks is diminished by the increase of the metadata cost.
+
+
+### Frequency Based Chunking
+Main idea: after a coarse-grained CDC pass, it performs a second-level (fine-grained) chunking by identifying the chunks whose frequency is greater than a predefined threshold.
+> this method requires a strong assumption on the knowledge of the individual chunk frequency.
+> two steps: chunk frequency estimation + chunking
+
+- Data dedup gain
+Defined as the amount of duplicate data removed minus the cost of the metadata. Let $D(s) = distinct\{c_1, c_2, ..., c_n\}$ and $m = |D(s)|$:
+$$
+gain_A(s) = folding_{factor}(s) - metadata(s)
+$$
+
+$$
+folding_{factor}(s) = \sum_{c_i \in distinct(s)} |c_i| (f(c_i) - 1)
+$$
+here, $f(c_i)$ is the number of times a unique chunk $c_i$ appears in the stream.
+
+
+$$
+metadata(s) = |index| + |chunklist| = \frac{1}{8} m \log_2 m + 20n
+$$
+$\log_2 m$ is the minimal length (in bits) of a chunk pointer to a unique chunk; $20$ is the size of the 20-byte SHA-1 hash per chunk.
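+
+As a small worked example of the gain formula, the hypothetical helper below computes $gain_A(s)$ from a list of (chunk id, size) pairs for a toy stream:
+
+```python
+import math
+from collections import Counter
+
+def dedup_gain(chunks):
+    """chunks: list of (chunk_id, size_in_bytes) pairs for the whole stream s."""
+    n = len(chunks)                                  # total number of chunks
+    freq = Counter(cid for cid, _ in chunks)         # f(c_i)
+    size = {cid: sz for cid, sz in chunks}           # |c_i|
+    m = len(freq)                                    # number of distinct chunks
+    folding = sum(size[cid] * (f - 1) for cid, f in freq.items())
+    metadata = (m * math.log2(m) / 8 if m else 0) + 20 * n   # index (bits -> bytes) + SHA-1 list
+    return folding - metadata
+```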
+
+
+
+- Frequency Estimation
+In this paper, it adopts the concept of parallel stream filter and designs a special parallel bloom filter to identify high-frequency chunks and to obtain their frequency estimates.
+
+**How to reduce the overhead of estimation?**
+This algorithm may only require the knowledge of high-frequency chunk candidates.
+> One interesting observation is that a majority of the chunk candidates are of low frequencies.
+
+Based on this idea, this paper applies a filtering process to eliminate as many low-frequency chunks as possible.
+> reserve resource to obtain an accurate frequency estimate for the high-frequency chunks.
+
+
+1. Prefiltering
+It requires only one XOR operation per observed chunk candidate to make the filtering decision.
+> a parameter $r_0$ controls the sampling rate
+> for example, $r_0 = \frac{1}{32}$ makes a chunk candidate pass if its Rabin fingerprint value modulo 32 equals 1
+
+2. Parallel filtering
+After prefiltering, it checks the appearance of a given chunk candidate in all $V$ Bloom filters.
+> If the chunk is found in every Bloom filter, it passes through the parallel filter and its frequency starts to be counted.
+> Otherwise, the chunk is recorded in one of the $V$ Bloom filters chosen at random.
+
+Without considering the false positives caused by Bloom filters, only chunk candidates with frequency greater than $V$ can possibly pass the parallel filtering process.
+
+Due to the efficiency of the Bloom filter data structure, it consumes only a small amount of memory; in this way the majority of low-frequency chunk candidates can be filtered out with few resources.
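+
+A minimal sketch of the parallel filtering step described above, under assumptions: the Bloom-filter size, hash count, and class/method names are illustrative, and SHA-1 is reused only to derive bit positions:
+
+```python
+import hashlib, random
+
+class ParallelBloomFilter:
+    """Count a chunk's frequency only after it has been seen in all V filters."""
+    def __init__(self, v=4, bits=1 << 20, hashes=3):
+        self.filters = [bytearray(bits // 8) for _ in range(v)]
+        self.bits, self.hashes = bits, hashes
+        self.freq = {}                                   # surviving high-frequency candidates
+
+    def _positions(self, fp: bytes):
+        h = hashlib.sha1(fp).digest()
+        return [int.from_bytes(h[4 * i:4 * i + 4], "little") % self.bits
+                for i in range(self.hashes)]
+
+    def observe(self, fp: bytes):
+        pos = self._positions(fp)
+        in_all = all(all(f[p >> 3] & (1 << (p & 7)) for p in pos) for f in self.filters)
+        if in_all:                                       # passed the parallel filter
+            self.freq[fp] = self.freq.get(fp, 0) + 1
+        else:                                            # record in one filter at random
+            f = random.choice(self.filters)
+            for p in pos:
+                f[p >> 3] |= 1 << (p & 7)
+```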
+
+It mentions that there is a rich literature on detecting high-frequency items in a data stream.
+> However, for this problem, the threshold for high-frequency chunks is usually as low as 15.
+> The classical heavy-hitter detection algorithms do not scale under such a low threshold.
+
+- Two-stage chunking
+Main idea: combining the coarse-grained CDC chunking with fine-grained frequency based chunking.
+
+
+
+### Implementation and Evaluation
+- Evaluation
+Datasets: three empirical datasets
+> two datasets (Linux and Nytimes): data streams exhibit high degree of redundancy
+> one dataset (Mailbox): dataset with low detectable redundancy
+
+Evaluation Criteria:
+> Duplicate Elimination Ratio
+> Average Chunk Size
+
+
+
+## 2. Strength (Contributions of the paper)
+1. The FBC algorithm consistently outperforms the CDC algorithm in terms of
+> achieving a better dedup gain
+> producing a much smaller number of chunks
+
+2. This paper advances the notion of data dedup gain to compare different chunking algorithms in a realistic scenario (dedup saving and metadata overhead).
+
+3. propose a novel frequency based chunking algorithm
+> considers the frequency information of data segments during chunking process.
+
+4. design a statistical chunk frequency estimation algorithm.
+> with small memory footprint
+
+## 3. Weakness (Limitations of the paper)
+1. The writing of this paper is not very clear.
+
+2. This approach needs an extra pass over the dataset to obtain frequency information
+> a one-pass algorithm that conducts the frequency estimation and the chunking process simultaneously would be needed.
+> An intuitive idea: track the top-k most frequent segments instead of all segments with frequency greater than a certain threshold.
+
+## 4. Future Works
+1. In this paper, one thing I can use is the idea of using a filter to reduce the memory overhead of the hash table (how about using a sketch as the filter?).
+
+2. This paper considers a fixed-size chunk to be frequent if its frequency is no less than a pre-defined threshold $\delta$
+> this threshold is determined based on domain knowledge of the datasets.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Chunking/RapidCDC-SoCC'19.md b/StoragePaperNote/Deduplication/Chunking/RapidCDC-SoCC'19.md
old mode 100644
new mode 100755
index 600650c..afd3744
--- a/StoragePaperNote/Deduplication/Chunking/RapidCDC-SoCC'19.md
+++ b/StoragePaperNote/Deduplication/Chunking/RapidCDC-SoCC'19.md
@@ -1,95 +1,95 @@
----
-typora-copy-images-to: ../paper_figure
----
-RapidCDC: Leveraging Duplicate Locality to Accelerate Chunking in CDC-based Deduplication Systems
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| SoCC'19 | Chunking |
-[TOC]
-
-## 1. Summary
-
-### Motivation of this paper
-- CDC is compute-intensive and time-consuming
- - a major performance bottleneck of the CDC based deduplication system.
- - Should make the I/O devices, such as hard disks and SSDs be the performance bottleneck
-
-- Duplicate locality implication
- - the improvement of chunking speed has not been considered at all.
-
-- Main idea
- - Leverage **duplicate locality** to remove the need of byte-by-byte window rolling in the determination of chunk boundaries.
- - Duplicate locality: multiple duplicate chunks are likely to occur together.
- - the longer the sequence, the stronger the locality is.
-
-### RapidCDC
-
-- Quantitative analysis of duplicate locality
- - the layout of duplicate data (duplicate chunks are likely to stay together)
- - LQ sequence: deduplicatable chunks constitute a chunk sequence
-
-use the number of contiguous deduplicatable chunks immediately following the first deduplicatable chunk to quantify the locality.
-> the majority of duplicate chunks are in the LQ sequence.
-
-
-- Design idea
- - exploit the duplicate locality in the datasets to enable a chunking method which detects chunk boundaries *without a byte-by-byte window rolling*.
- - Allow a list of *next-chunk* sizes (**size list**) to be attached to a fingerprint
- - 2 bytes to record the chunk size
- - simpler relationship chain
- - chunk --> fingerprint --> the size of the next chunk (a list)
- - If the position is accepted, it can avoids rolling the window one byte at a time for thousands of times to reach the next chunk boundary.
- - If not accepted, it will try another next-chunk size in the size list of the duplicate chunk's fingerprint.
- - If still not accepted, it will reduce to original rolling window by byte-by-byte.
-
-- Accepting suggested chunk boundaries
- - Trade-off between performance gain and risk of performance penalty.
- - FF (Fast-forwarding only): without further checking
- - FF + RWT (Rolling window test): compute the rolling window of the position
- - FF + MT (Marker Test): compared the last byte, need to record the last byte of a chunk
- - FF + RWT + FPT (Fingerprint test): also compute the fingerprint of the chunk, test whether the fingerprint currently exists.
-
-- Maintaining list of next-chunk size
- - LRU policy to update the list (an ordered list)
-
-
-
-### Implementation and Evaluation
-- Datasets
- - Synthetic dataset
- - real-word dataset: Docker, Linux source code, Google-news
- - chunking hash functions: Robin and Gear (in FastCDC)
-
-- Evaluation
- - Impact of modification count and distribution
- - Impact of minimum chunk sizes and hash functions
- - Throughput of multi-threaded
- - Consider a directory, each thread is responsible for the chunking of one file.
-
-## 2. Strength (Contributions of the paper)
-
-1. propose a new chunking algorithm which leverages the duplicate locality to accelerate the chunking performance.
-
-2. an quantitative scheme to measure the duplicate locality.
-
-## 3. Weakness (Limitations of the paper)
-
-1. the idea is not novel
-
-## 4. Some Insights (Future work)
-
-1. For deduplication production system
- NetApp ONTAP system
- Dell EMC Data Domain: 4KB, 8KB, 12KB
-
- LBFS: 2KB, 16KB, 64KB
-
-2. The boundary-shift issue
- for insertion or deletion at the beginning of a store file.
-
-CDC chunking: a chunk boundary is determined at a byte offset which can satisfy a predefined chunking condition.
-
-3. the difference in the hash function in chunking
-the hash function used for chunk boundary detection is different from the one used for fingerprinting
-> does not need to be collision-resistant
+---
+typora-copy-images-to: ../paper_figure
+---
+RapidCDC: Leveraging Duplicate Locality to Accelerate Chunking in CDC-based Deduplication Systems
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| SoCC'19 | Chunking |
+[TOC]
+
+## 1. Summary
+
+### Motivation of this paper
+- CDC is compute-intensive and time-consuming
+  - a major performance bottleneck of CDC-based deduplication systems.
+  - Ideally, the I/O devices, such as hard disks and SSDs, rather than the CPU, should be the performance bottleneck.
+
+- Duplicate locality implication
+  - improving chunking speed via duplicate locality has not been considered at all.
+
+- Main idea
+  - Leverage **duplicate locality** to remove the need for byte-by-byte window rolling when determining chunk boundaries.
+ - Duplicate locality: multiple duplicate chunks are likely to occur together.
+ - the longer the sequence, the stronger the locality is.
+
+### RapidCDC
+
+- Quantitative analysis of duplicate locality
+ - the layout of duplicate data (duplicate chunks are likely to stay together)
+  - LQ sequence: consecutive deduplicatable chunks constitute a chunk sequence
+
+The paper uses the number of contiguous deduplicatable chunks immediately following the first deduplicatable chunk to quantify the locality.
+> the majority of duplicate chunks fall within LQ sequences.
+
+
+- Design idea
+  - exploit the duplicate locality in the datasets to enable a chunking method which detects chunk boundaries *without byte-by-byte window rolling*.
+  - Allow a list of *next-chunk* sizes (**size list**) to be attached to a fingerprint
+    - 2 bytes are used to record a chunk size
+  - simpler relationship chain
+    - chunk --> fingerprint --> the size of the next chunk (a list)
+  - If a suggested position is accepted, it avoids rolling the window one byte at a time for thousands of positions to reach the next chunk boundary.
+  - If not accepted, it tries another next-chunk size in the size list of the duplicate chunk's fingerprint.
+  - If still not accepted, it falls back to the original byte-by-byte rolling window (see the sketch after the next two lists).
+
+- Accepting suggested chunk boundaries
+  - Trade-off between performance gain and risk of performance penalty.
+  - FF (Fast-forwarding only): accept the suggested position without further checking
+  - FF + RWT (Rolling window test): recompute the rolling-window hash at the suggested position and check the chunking condition
+  - FF + MT (Marker test): compare the last byte of the suggested chunk with the recorded last byte (marker) of the duplicate chunk
+  - FF + RWT + FPT (Fingerprint test): additionally compute the fingerprint of the suggested chunk and test whether that fingerprint already exists.
+
+- Maintaining the list of next-chunk sizes
+  - an LRU policy updates the (ordered) size list; the sketch below includes this update.
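+
+A minimal, self-contained sketch of this design is given below, assuming an FF + RWT acceptance policy. Everything in it is an illustrative assumption rather than the paper's implementation: the toy `window_match` condition (with `MASK` and `WIN`), the built-in `hash()` used as a stand-in fingerprint, and all helper names; `data` is assumed to be `bytes`.
+
+```python
+# Sketch of RapidCDC-style fast-forwarding (FF + RWT) with an LRU size list.
+MASK = 0x1FFF        # toy chunking condition => ~8KB average chunk size
+WIN = 48             # rolling-window width in bytes
+
+def window_match(data, end):
+    """Toy chunking condition over the WIN bytes ending at offset `end`."""
+    if end < WIN:
+        return False
+    h = 0
+    for b in data[end - WIN:end]:
+        h = (h * 31 + b) & 0xFFFFFFFF
+    return (h & MASK) == 0
+
+def next_boundary_slow(data, start, min_size=2048, max_size=65536):
+    """Classic CDC fallback: roll the window byte by byte until the condition holds."""
+    limit = min(start + max_size, len(data))
+    for pos in range(start + min_size, limit):
+        if window_match(data, pos):
+            return pos
+    return limit
+
+def rapidcdc_chunk(data, size_lists):
+    """size_lists maps a chunk fingerprint to an LRU-ordered list of next-chunk sizes."""
+    chunks, pos, prev_fp = [], 0, None
+    while pos < len(data):
+        boundary = None
+        for size in size_lists.get(prev_fp, []):       # FF: try the suggested sizes
+            if pos + size <= len(data) and window_match(data, pos + size):
+                boundary = pos + size                  # RWT accepted the position
+                break
+        if boundary is None:                           # fall back to byte-by-byte rolling
+            boundary = next_boundary_slow(data, pos)
+        fp = hash(bytes(data[pos:boundary]))           # stand-in for a real fingerprint
+        if prev_fp is not None:                        # record this size under the previous fp
+            lst = size_lists.setdefault(prev_fp, [])
+            if boundary - pos in lst:
+                lst.remove(boundary - pos)
+            lst.insert(0, boundary - pos)              # LRU: most recently used size first
+        chunks.append((pos, boundary, fp))
+        prev_fp, pos = fp, boundary
+    return chunks
+```
+
+On a second backup of nearly identical data, most iterations should accept a suggested size immediately, so the expensive byte-by-byte fallback is rarely executed.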
+
+
+
+### Implementation and Evaluation
+- Datasets
+ - Synthetic dataset
+  - real-world datasets: Docker, Linux source code, Google-news
+  - chunking hash functions: Rabin and Gear (as in FastCDC)
+
+- Evaluation
+ - Impact of modification count and distribution
+ - Impact of minimum chunk sizes and hash functions
+  - Multi-threaded throughput
+    - Given a directory, each thread is responsible for chunking one file.
+
+## 2. Strength (Contributions of the paper)
+
+1. It proposes a new chunking algorithm that leverages duplicate locality to accelerate chunking performance.
+
+2. It provides a quantitative scheme to measure duplicate locality.
+
+## 3. Weakness (Limitations of the paper)
+
+1. the idea is not novel
+
+## 4. Some Insights (Future work)
+
+1. Chunk-size settings in production deduplication systems
+   NetApp ONTAP system
+   Dell EMC Data Domain: 4KB, 8KB, 12KB
+
+   LBFS: 2KB, 16KB, 64KB
+
+2. The boundary-shift issue
+   arises upon insertion or deletion at the beginning of a stored file.
+
+CDC chunking: a chunk boundary is determined at a byte offset that satisfies a predefined chunking condition.
+
+3. The difference in the hash functions used in chunking
+The hash function used for chunk boundary detection is different from the one used for fingerprinting:
+> it does not need to be collision-resistant
diff --git a/StoragePaperNote/Deduplication/Chunking/SSCDC-SYSTOR'19.md b/StoragePaperNote/Deduplication/Chunking/SSCDC-SYSTOR'19.md
old mode 100644
new mode 100755
index b66bf99..5ec2f32
--- a/StoragePaperNote/Deduplication/Chunking/SSCDC-SYSTOR'19.md
+++ b/StoragePaperNote/Deduplication/Chunking/SSCDC-SYSTOR'19.md
@@ -1,108 +1,108 @@
----
-typora-copy-images-to: ../paper_figure
----
-SS-CDC: A Two-stage Parallel Content-Defined Chunking for Deduplicating Backup Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| SYSTOR'19 | Deduplication Chunking|
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- To complete backing up of a large amount of data within a tight time window, the system has to provide sufficiently **high backup performance**.
-> deduplication adds the significant performance overhead to the system (variable-size chunking process)
-> it needs to calculate a hash value for the rolling window at almost every byte offset of a file, which consumes significant CPU resource and has become a performance **bottleneck** in many backup storage system.
-
-- Chunking invariability
-To accelerating CDC-based deduplication, it can partition an input file into segments, and leverage parallel hardware.
-> chunking invariability requires a parallel chunking algorithm always generates the identical set of chunks independent of the parallelism degree and the segment size.
-> However, many parallel CDC algorithms do not provide this guarantee. (different from sequential CDC)
-
-
-This paper porposes a two-stage parallel CDC which enables full parallelism on chunking of a file without compromising deduplication ratio.
-> further exploits instruction-level SIMD parallelism, offload chunking to SIMD platforms.
-> guarantee the chunking invariability
-
-### SS-CDC
-- Chunk boundary
-A chunk boundary is determined when two conditions are met:
->1. the chunk is within the range of pre-defined value.
->2. the hash value of the rolling windom matches a pre-defined value.
-
-- Original CDC chunk process is sequential
-The process of determining a sequence of boundaries in a file is inherently **sequential**, as declaration of a new boundary depends on:
->1. the hash value of current rolling window
->2. the previous boundary's position
-
-- Main Idea
-the chunking process can be separated into two tasks:
->1. rolling window computation: generate all potential chunk boundaries (expensive + parallel)
->2. select chunk boundaries out of the candidate ones so that meets the chunk size requirements (lightweight + serialized)
->Goal: generate the identical set of chunk boundaries and the same deduplication ratio as the sequential CDC.![1560263818057]
-
-
-- Two-stages of SS-CDC algorithm
-
-
-
-**First stage**: find the candidate boundary via rolling hash, produces a set of chunk boundary candidates which statisfy the first condition.
-> the result is record in a bit array.
-> Multiple bits can be set simultaneously using SIMD instructions without using locks.
-
-**Second stage**: select the final chunk boundaries from the candidates, which can meet the minimum and maximum chunk size constraints.
-For an input data with $N$ bytes, the output bit array will be of $N$ bits.
->1. a bit '1' at the bit-offset $k$ in the bit array indicates a chunk boundary candidate at the byte-offset $k$ in the input file.
->2. scanning from its beginning, searching for the '1' bits that meet the minimum and maximum chunk size constraints. These offsets are the final chunk boundaries.
-
-- Paralleling Operations in SS-CDC
-1. The first stage can be paralleled by assigning a different thread for each segment.
-
-2. The second stage must be performed by sequentially to find the next chunk boundary which meets the minimum and maximum chunk size constraints.
-> How to improve? the bit array comtains mostly '0' bits, with only a few '1' bits, can regard it as 32-bit integer, if that integer is non-zero, a bit-by-bit checking is needed.
-
-
-- Integrate with AVX-512
-use an AVX register to store all values of all current rolling windows (one for each segment)
-> use some AVX instruction to accelerate the speed.
-
-- Sacle to multiple cores
-A chunking thread is started at each core, and retrieves a batch of $N$ segments each time from the head of the queue for the first stage chunking
-> a lock is required to enforce an exclusive access to the queue.
-
-### Implementation and Evaluation
-- Evaluation
-Datasets:
->1. Linux source codes: from Linux Kernel Archives (transform tar $\rightarrow$ mtar)
->2. Docker Hub image: each image is tar file
-
-1. Chunking speed
-compared with sequential CDC with one thread running on one core
-> 400MB/s $\rightarrow$ 1500MB/s (3.3 $\times$)
-> achieved by leveraging the instruction-level parallelism
-
-Minimum chunk size also impacts the speed up of SS-CDC.
-
-2. The case of multiple cores for multiple files
-Multithreading of single file suffers from the same bottlenecks
-> as the use of a lock at the segment queue, barrier synchronization at the end of the first stage.
-
-## 2. Strength (Contributions of the paper)
-1. identify the root cause fo the deduplication ratio degradation of existing parallel CDC methods
-> provide quantitative analysis
-
-2. This is the first work to use Intel AVX instructions for parallel chunking.
-
-3. The key contribution of this work:
-SS-CDC guarantees chunking invariability and achieves parallel chunking performance without impacting deduplication ratio.
-
-## 3. Weakness (Limitations of the paper)
-1. SS-CDC actually needs to read more data and do more rolling hash calculation than sequential chunking.
-> it does not skip the input data using the minimum chunk size.
-> SS-CDC has to scan and calcualate a hash for every byte.
-
-
-## 4. Future Works
-1. This paper catches two limitations of the current parallelized chunking algorithms
-> chunking invariability
+---
+typora-copy-images-to: ../paper_figure
+---
+SS-CDC: A Two-stage Parallel Content-Defined Chunking for Deduplicating Backup Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| SYSTOR'19 | Deduplication Chunking|
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- To complete the backup of a large amount of data within a tight time window, the system has to provide sufficiently **high backup performance**.
+> deduplication adds significant performance overhead to the system (the variable-size chunking process)
+> it needs to calculate a hash value for the rolling window at almost every byte offset of a file, which consumes significant CPU resources and has become a performance **bottleneck** in many backup storage systems.
+
+- Chunking invariability
+To accelerate CDC-based deduplication, one can partition an input file into segments and chunk them on parallel hardware.
+> chunking invariability requires that a parallel chunking algorithm always generate the identical set of chunks, independent of the degree of parallelism and the segment size.
+> However, many parallel CDC algorithms do not provide this guarantee (their chunks can differ from sequential CDC).
+
+
+This paper proposes a two-stage parallel CDC which enables full parallelism on chunking of a file without compromising the deduplication ratio.
+> it further exploits instruction-level SIMD parallelism, offloading chunking to SIMD platforms.
+> it guarantees chunking invariability
+
+### SS-CDC
+- Chunk boundary
+A chunk boundary is determined when two conditions are met:
+>1. the chunk size is within the pre-defined range.
+>2. the hash value of the rolling window matches a pre-defined value.
+
+- The original CDC chunking process is sequential
+The process of determining a sequence of boundaries in a file is inherently **sequential**, as declaring a new boundary depends on:
+>1. the hash value of the current rolling window
+>2. the previous boundary's position
+
+- Main Idea
+the chunking process can be separated into two tasks:
+>1. rolling window computation: generate all potential chunk boundaries (expensive + parallelizable)
+>2. select chunk boundaries from the candidates so that the chunk size requirements are met (lightweight + serialized)
+>Goal: generate the identical set of chunk boundaries and the same deduplication ratio as sequential CDC.
+
+
+- Two stages of the SS-CDC algorithm
+
+
+
+**First stage**: find the candidate boundaries via the rolling hash; it produces a set of chunk boundary candidates which satisfy the first condition.
+> the result is recorded in a bit array.
+> Multiple bits can be set simultaneously using SIMD instructions without using locks.
+
+**Second stage**: select the final chunk boundaries from the candidates, such that they meet the minimum and maximum chunk size constraints.
+For an input of $N$ bytes, the output bit array has $N$ bits.
+>1. a bit '1' at bit-offset $k$ in the bit array indicates a chunk boundary candidate at byte-offset $k$ in the input file.
+>2. the array is scanned from its beginning, searching for the '1' bits that meet the minimum and maximum chunk size constraints. These offsets are the final chunk boundaries.
+
+- Parallelizing operations in SS-CDC
+1. The first stage can be parallelized by assigning a different thread to each segment.
+
+2. The second stage must be performed sequentially to find the next chunk boundary which meets the minimum and maximum chunk size constraints.
+> How to improve? The bit array contains mostly '0' bits with only a few '1' bits, so it can be scanned 32 bits (one integer) at a time; only when an integer is non-zero is a bit-by-bit check needed (see the sketch below).
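+
+The word-wise scan of the candidate bit array could look like the following sketch (my assumptions, not the paper's code): `words` is assumed to pack the bit array into 32-bit integers, with bit $k$ of the array marking a candidate boundary at byte offset $k$.
+
+```python
+# Sketch of SS-CDC's second stage: pick final boundaries from the candidate bit
+# array, skipping 32-bit words that are all zero and forcing a cut at max_size.
+def select_boundaries(words, n_bytes, min_size, max_size):
+    boundaries, last = [], 0
+    pos = min_size                       # earliest legal boundary after `last`
+    while last < n_bytes:
+        limit = min(last + max_size, n_bytes)
+        found = None
+        while pos < limit:
+            w = words[pos // 32]
+            if w == 0:                   # fast path: skip a whole all-zero word
+                pos = (pos // 32 + 1) * 32
+                continue
+            if (w >> (pos % 32)) & 1:    # slow path: bit-by-bit inside a non-zero word
+                found = pos
+                break
+            pos += 1
+        boundary = found if found is not None else limit   # forced cut at max size / EOF
+        boundaries.append(boundary)
+        last = boundary
+        pos = last + min_size
+    return boundaries
+```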
+
+
+- Integration with AVX-512
+use an AVX register to store the values of all current rolling windows (one for each segment)
+> AVX instructions accelerate the rolling-hash computation.
+
+- Scale to multiple cores
+A chunking thread is started on each core and retrieves a batch of $N$ segments each time from the head of the queue for first-stage chunking
+> a lock is required to enforce exclusive access to the queue.
+
+### Implementation and Evaluation
+- Evaluation
+Datasets:
+>1. Linux source code: from the Linux Kernel Archives (transformed tar $\rightarrow$ mtar)
+>2. Docker Hub images: each image is a tar file
+
+1. Chunking speed
+compared with sequential CDC with one thread running on one core
+> 400MB/s $\rightarrow$ 1500MB/s (3.3 $\times$)
+> achieved by leveraging the instruction-level parallelism
+
+The minimum chunk size also impacts the speedup of SS-CDC.
+
+2. The case of multiple cores for multiple files
+Multithreading on a single file suffers from the same bottlenecks,
+> such as the lock on the segment queue and the barrier synchronization at the end of the first stage.
+
+## 2. Strength (Contributions of the paper)
+1. It identifies the root cause of the deduplication ratio degradation in existing parallel CDC methods
+> provide quantitative analysis
+
+2. This is the first work to use Intel AVX instructions for parallel chunking.
+
+3. The key contribution of this work:
+SS-CDC guarantees chunking invariability and achieves parallel chunking performance without impacting deduplication ratio.
+
+## 3. Weakness (Limitations of the paper)
+1. SS-CDC actually needs to read more data and do more rolling hash calculations than sequential chunking.
+> it does not skip input data using the minimum chunk size.
+> SS-CDC has to scan and calculate a hash for every byte.
+
+
+## 4. Future Works
+1. This paper identifies two limitations of current parallelized chunking algorithms:
+> loss of chunking invariability
> inability to take advantage of the instruction-level parallelism offered by SIMD platforms (frequent branches)
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/CodePlugin-HotCloud'15.md b/StoragePaperNote/Deduplication/CodePlugin-HotCloud'15.md
old mode 100644
new mode 100755
index f9037ea..e28ab1d
--- a/StoragePaperNote/Deduplication/CodePlugin-HotCloud'15.md
+++ b/StoragePaperNote/Deduplication/CodePlugin-HotCloud'15.md
@@ -1,55 +1,55 @@
----
-typora-copy-images-to: paper_figure
----
-# CodePlugin: Plugging Deduplication into Erasure Coding for Cloud Storage
-@HotCloud'15 @Deduplication && Erasure Code
-[TOC]
-## Summary
-***Motivation of this paper***: To simplify the implementations of erasure coding schemes. Many system only support file appending operations. This features leads to a non-trivial and increasing portion of redundant data on cloud storage systems.
-Such redundancy leads to several consequences:
->1. extra storage has to to be used to acommodate such redundant data.
->2. extra coding (I/O) cost has to be paid since the redindant data have to be encoded as well.
-
-***CodePlugin***
-**Main Idea**: CodePlugin introduces some *pre-processing* steps before the normal encoding.
-> In these pre-processing steps, the data duplications are identified and properly shuffled so that the redundant blocks do not have to be encoded.
-
-- CodePlugin Design
->1. the de-duplicating step that tries to identify the redundant blocks
->2. the pseudo-shuffling step. which is to virtually re-arrange the positions of data blocks so that encoding is only needed on a subset of blocks.
->3. the optimal sub-files exchanging step, which can further reduce the number of blocks to be encoded.
-
-- CodePlugin uses a cache to keep the fingerprints. In this cache, a hash table is responsible for mapping fingerprints of unique blocks to their address.
-> When a file is chunked and all fingerprints are generated, after comparing these fingerprints to hte ones in the cache, CodePlugin can tell which blocks are redundant.
-
-
-After the file is chunked, the blocks are identified via **3-tuple** $(fid, sid, cid)$ address, which is composed of the corresponding *file id (fid)*, *sub-file id (sid)* and *column id (cid)*.
-
-
-- Pseudo-shuffling
-To encode the unique blocks together, and leave the redundant blocks untouched
-Since it wants to keep the file untouched, it just needs record the original address of the block instead of moving the actual block around.
-
-
-
-***Implementation and Evaluation***:
-**Impementation**
-none
-**Evaluation**
-
-- Preliminary experiments based on some real-world VM workloads
-1. It foucses on the improvment of CodePlugin (with different coding parameters)
->1. encoding throughput
->2. storage space
-
-2. For the CodePlugin overhead, it evaluates the throughput of pre-processing with different coding schemes.
-
-3. It also tests the throughput by varying cache and coding parameters.
-
-## Strength (Contributions of the paper)
-1. This paper proposes the CodePlugin, a mechanism that is applicable to any existing erasure coding scheme.
-2. It also conducts the experiments based on some real-world cloud VM images
-## Weakness (Limitations of the paper)
-1. In order to deduplicate the redundant data, CodePlugin mainly introduces overhead from the two aspects: the CPU cost (MD5 fingerprint) and the storage cost (Map-Address file)
-## Future Works
+---
+typora-copy-images-to: paper_figure
+---
+# CodePlugin: Plugging Deduplication into Erasure Coding for Cloud Storage
+@HotCloud'15 @Deduplication && Erasure Code
+[TOC]
+## Summary
+***Motivation of this paper***: To simplify the implementation of erasure coding schemes, many systems only support file-appending operations. This feature leads to a non-trivial and increasing portion of redundant data in cloud storage systems.
+Such redundancy leads to several consequences:
+>1. extra storage has to be used to accommodate such redundant data.
+>2. extra coding (I/O) cost has to be paid since the redundant data have to be encoded as well.
+
+***CodePlugin***
+**Main Idea**: CodePlugin introduces some *pre-processing* steps before the normal encoding.
+> In these pre-processing steps, the data duplications are identified and properly shuffled so that the redundant blocks do not have to be encoded.
+
+- CodePlugin Design
+>1. the de-duplicating step that tries to identify the redundant blocks
+>2. the pseudo-shuffling step, which virtually re-arranges the positions of data blocks so that encoding is only needed on a subset of blocks.
+>3. the optimal sub-file exchanging step, which can further reduce the number of blocks to be encoded.
+
+- CodePlugin uses a cache to keep the fingerprints. In this cache, a hash table maps fingerprints of unique blocks to their addresses.
+> When a file is chunked and all fingerprints are generated, CodePlugin compares these fingerprints to the ones in the cache and can tell which blocks are redundant.
+
+
+After the file is chunked, the blocks are identified via **3-tuple** $(fid, sid, cid)$ address, which is composed of the corresponding *file id (fid)*, *sub-file id (sid)* and *column id (cid)*.
+
+
+- Pseudo-shuffling
+encodes the unique blocks together and leaves the redundant blocks untouched.
+Since the file layout should stay untouched, it only needs to record the original address of each block instead of physically moving blocks around (a small sketch follows).
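+
+A small sketch of this bookkeeping (my assumptions, not the paper's code; `pseudo_shuffle` and the address layout are illustrative):
+
+```python
+# Pseudo-shuffling sketch: record where each unique block "virtually" sits for
+# encoding instead of physically moving blocks; redundant blocks are skipped.
+import hashlib
+
+def pseudo_shuffle(blocks):
+    """blocks: dict mapping a (fid, sid, cid) address to the block's bytes.
+    Returns (unique_addrs, addr_map): the blocks that must be erasure-coded,
+    and a map from every address to the address of its unique copy."""
+    seen = {}          # fingerprint -> address of the first (unique) copy
+    addr_map = {}      # every address -> address of the unique copy it refers to
+    unique_addrs = []
+    for addr, data in blocks.items():
+        fp = hashlib.md5(data).digest()     # the note mentions MD5 fingerprints
+        if fp in seen:
+            addr_map[addr] = seen[fp]       # redundant block: just point to the copy
+        else:
+            seen[fp] = addr
+            addr_map[addr] = addr
+            unique_addrs.append(addr)       # only these blocks are encoded
+    return unique_addrs, addr_map
+```
+
+The address map roughly corresponds to the Map-Address file mentioned in the weakness below: it is the extra metadata that lets reads find the unique copy of a redundant block.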
+
+
+
+***Implementation and Evaluation***:
+**Implementation**
+none
+**Evaluation**
+
+- Preliminary experiments based on some real-world VM workloads
+1. It focuses on the improvement of CodePlugin (with different coding parameters)
+>1. encoding throughput
+>2. storage space
+
+2. For the CodePlugin overhead, it evaluates the throughput of pre-processing with different coding schemes.
+
+3. It also tests the throughput by varying cache and coding parameters.
+
+## Strength (Contributions of the paper)
+1. This paper proposes the CodePlugin, a mechanism that is applicable to any existing erasure coding scheme.
+2. It also conducts experiments on some real-world cloud VM images.
+## Weakness (Limitations of the paper)
+1. In order to deduplicate the redundant data, CodePlugin mainly introduces overhead from two aspects: the CPU cost (MD5 fingerprinting) and the storage cost (the Map-Address file).
+## Future Works
1. This work only considers fixed-size chunking; what about variable-size chunking, which is more efficient at detecting redundant data?
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Dedup-Estmation/ContentBasedSampling-ATC'13.md b/StoragePaperNote/Deduplication/Dedup-Estmation/ContentBasedSampling-ATC'13.md
old mode 100644
new mode 100755
index 6f6f552..0ed7c08
--- a/StoragePaperNote/Deduplication/Dedup-Estmation/ContentBasedSampling-ATC'13.md
+++ b/StoragePaperNote/Deduplication/Dedup-Estmation/ContentBasedSampling-ATC'13.md
@@ -1,56 +1,56 @@
----
-typora-copy-images-to: paper_figure
----
-Estimating Duplication by Content-based Sampling
-------------------------------------------
-| Venue | Category |
-| :----: | :------------------: |
-| ATC'13 | Deduplication Sample |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The benefit of deduplication in a primary storage system varies for different workloads.
-> For a certain workload that have a low level of deduplication, one would to turn off the deduplication feature to avoid its effect on **I/O performance** and to avoid the **metadata overhead** of deduplication.
-
-It is necessary for the estimator to allow customers to quickly estimate the deduplication benefit on their primary data.
-> existing deduplication estimators are either not fast enough or not accurate enough.
-
-### Content-based Sample
-
-- Theory
-Assume a hash function that generates a fingerprint for each data block. The proposed **content-based sampling** applies a modulo-based filter to all the block fingerprints of a data set.
-> A block fingerprint passes the filter and is added to the sample
-> $Fingerprint$ Mod $M$ == $X$, $M$ is the filter divisor.
-
-**Idea**: split the fingerprint space into $M$ partitions, and to use one of the partitions in the estimation.
-
-> The estimation of distinct block size in the whole data set: $S^* = M \times S_{sample}$
-> can also consider the case where the size of block is different.
-
-
-
-- How to set $M$?
-Define a $\alpha-\beta$ accuracy notion, to prove how to choose $M$ under the given $\alpha$ and $\beta$.
-
-### Implementation and Evaluation
-- Changing logging module: samples data blocks during the consistency point. (for update)
-- Disk Scanner: sample existing blocks in the volume.
-- All the sample will store in a fingerprint sample file (FPS).
-- Estimation Operation:
-merges the sample from change logging to the FPS, and update the estimate accordingly.
-
-- Experiment
-1. Accuracy
-2. Performance degradation: increase the rate of random access for counting module.
-
-
-## 2. Strength (Contributions of the paper)
-1. this work provides a **single-scan-pass** method for estimating deduplication ratio in a large dataset, also, with statistically guaranteed accuracy.
-2. This work also consider to tackle the rapid change of dataset.
-3. It implements this technique in a real commercial storage system for real world dataset. It also verifies the impact of performance (less than a few percent).
-## 3. Weakness (Limitations of the paper)
-1. Theory of proving the correctness bound of is not very clear, I cannot understand the rationale behind them.
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+Estimating Duplication by Content-based Sampling
+------------------------------------------
+| Venue | Category |
+| :----: | :------------------: |
+| ATC'13 | Deduplication Sample |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The benefit of deduplication in a primary storage system varies for different workloads.
+> For workloads that have a low level of deduplication, one would want to turn off the deduplication feature to avoid its effect on **I/O performance** and to avoid the **metadata overhead** of deduplication.
+
+It is necessary for the estimator to allow customers to quickly estimate the deduplication benefit on their primary data.
+> existing deduplication estimators are either not fast enough or not accurate enough.
+
+### Content-based Sample
+
+- Theory
+Assume a hash function that generates a fingerprint for each data block. The proposed **content-based sampling** applies a modulo-based filter to all the block fingerprints of a data set.
+> A block fingerprint passes the filter and is added to the sample if
+> $Fingerprint \bmod M == X$, where $M$ is the filter divisor.
+
+**Idea**: split the fingerprint space into $M$ partitions and use one of the partitions for the estimation (a minimal sketch follows).
+
+> The estimate of the distinct block size in the whole data set: $S^* = M \times S_{sample}$
+> the case where block sizes differ can also be handled.
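+
+A minimal sketch of the filter and the estimate (assumed names; fixed block size for simplicity):
+
+```python
+# Content-based sampling sketch: a block enters the sample iff fingerprint mod M == X.
+import hashlib
+
+def estimate_distinct_size(blocks, M, X=0, block_size=4096):
+    """blocks: iterable of block contents (bytes). Returns S* = M * S_sample."""
+    sample_fps = set()
+    for data in blocks:
+        fp = int.from_bytes(hashlib.sha1(data).digest(), "big")
+        if fp % M == X:
+            sample_fps.add(fp)               # duplicates collapse inside the sample
+    s_sample = len(sample_fps) * block_size  # distinct bytes observed in the sample
+    return M * s_sample                      # S* = M * S_sample
+```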
+
+
+
+- How to set $M$?
+Define an $\alpha$-$\beta$ accuracy notion and prove how to choose $M$ under given $\alpha$ and $\beta$.
+
+### Implementation and Evaluation
+- Change logging module: samples data blocks during the consistency point (for updates).
+- Disk scanner: samples existing blocks in the volume.
+- All samples are stored in a fingerprint sample file (FPS).
+- Estimation operation:
+merges the samples from change logging into the FPS and updates the estimate accordingly.
+
+- Experiment
+1. Accuracy
+2. Performance degradation: increases the rate of random access for the counting module.
+
+
+## 2. Strength (Contributions of the paper)
+1. This work provides a **single-scan-pass** method for estimating the deduplication ratio of a large dataset, with statistically guaranteed accuracy.
+2. This work also considers how to handle rapid changes to the dataset.
+3. It implements this technique in a real commercial storage system on real-world datasets, and verifies that the performance impact is small (less than a few percent).
+## 3. Weakness (Limitations of the paper)
+1. The theory proving the accuracy bound is not very clear; I cannot fully follow the rationale behind it.
+
+## 4. Future Works
This paper provides a very simple, content-based sampling method over fingerprints, which is not very novel from my perspective, but it also presents the full theory to support the idea. The insight is that, for a large dataset, it is possible to estimate the deduplication ratio by sampling.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Dedup-Estmation/EstimateDedupRatio-MSST'12.md b/StoragePaperNote/Deduplication/Dedup-Estmation/EstimateDedupRatio-MSST'12.md
old mode 100644
new mode 100755
index 40439cd..dd6225a
--- a/StoragePaperNote/Deduplication/Dedup-Estmation/EstimateDedupRatio-MSST'12.md
+++ b/StoragePaperNote/Deduplication/Dedup-Estmation/EstimateDedupRatio-MSST'12.md
@@ -1,118 +1,118 @@
----
-typora-copy-images-to: ../paper_figure
----
-Estimation of Deduplication Ratios in Large Data Sets
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| MSST'12 | Deduplication Estimation |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper studies the problem of accurately estimating the data reduction ratio achieved by deduplication and compression on a *specific* data set.
-> this paper focuses on what can be done when scanning the entire data set.
-> using RAM as little as possible
-> without reading most of the actual data.
-
-The method of this work can be used to:
->1. estimating the number of disks to buy
->2. choosing a deduplication technique
->3. deciding whether to dedupe or not dedupe
-
-### Method Name
-- Key assumption
-This paper studies what can be done under **essentially the whole data set is scanned**.
-> how to estimate the deduplication and compression in an efficient manner.
-
-1. The main bulk
-reading data from disk, computing the hash, running a compression algorithm and updating the index table.
-> heaviest: access to the disk, and compression.
-
-- A general framework
-1. Sample phase
-the sample is taken at random where each element appers independently.
-> Hash values and compression rates are computed for each element in the sample.
-
-From the entire dataset, it would choose $m$ elements (a configurable parameter in advance), and compute the corresponding hash vale and add it to a set as the base sample.
-> record the count
-
-**Rationale**: in this paper's case, an element that two replicas in the dat set has double the probability of being included in the base sample.
-
-**Sample Method**:
-> 1. choose $m$ random numbers in $\{1, ...,n\}$
-> 2. generate a random number according to binomial distribution.
-
-2. Scan phase (used to derive the data-reduction ratio estimate)
-store statistics only about elements in the base sample.
-> The hash is computed for each element but it is only recorded if it matches a hash of an element in the base sample.
-
-The entire dataset is scanned, for each element, its hash signature is computed.
-> If this signature is matched in the base sample, then the corresponding count is incremented by 1.
-
-It finally estimates the
-$$
-Est = \frac{1}{m} \sum_{i \in Sample} \frac{BaseCount_{i}}{Total_{i}}
-$$
-
-- Full file deduplication
-deduplication is only done between identical files
-> achieves less than optimal deduplication ratios, yet it is easy to implement and can perform sufficiently well in some workloads.
-
-1. Sampling files
-> In this paper, the actual data needs to be read only for a small fraction of the file, which is related to the base sample.
-> In handling of files, each such byte has independent probability $\frac{m}{N}$, where $N$ is the total number of byte in the dataset.
-
-2. Scan optimization
-hash on the first block of the file
-> in many file system, the first block resides in the i-node of the file.
-> thus can be read quickly during a metadata scan without the addition of extra disk seeks.
-
-3. Using the following information
-> 1. the length of the file
-> 2. A hash signature of the first block of the file.
-
-If those two messages of a file can match, then compute the full hash of this file, and count the frequency.
-
-### Implementation and Evaluation
-- Implemention
-the scan phase can be run in parallel on a distributed system. The parallel execution would each node will do the scan locally and accumulate the count for the data adjacent to this node.
-
-
-- Evaluation
-1. Dataset (4 datasets)
-personal workstations, enterprise file system repository, and backup date of two types.
-2. Empirical accuracy
-Sample size vs. relative error
-
-
-## 2. Strength (Contributions of the paper)
-1. This paper sets the grounds on the limitations and inherent difficulties of sampling techniques.
-> both analytical and in practice
-> does not suffice to distinguish if the replication is a local phenomena or a global one.
-
-
-## 3. Weakness (Limitations of the paper)
-1. The method of this paper still needs to pass the whole dataset twice, which may incur the overhead of disk I/O.
-
-
-2. the higher the data-reducation ratio is, the harder it becomes to give an accurate estimation.
-> In order to determine the sample size, this method needs to give a bound of deduplication ratio
-
-## 4. Future Works
-1. This paper mentions that it is impossible to predict the deduplication ratio accurately by looking only at a random subset of the data.
-> can give arbitrarily skewed results.
-> whether randomly or according to various sampling methodologies.
-
-2. This paper also mentions it can incorporate major knowledge about the structure of the data to smartly sample it can then extrapolate what the overall ratio yet perform this efficiently with limited resources.
-> "educated-sampling"
-
-3. This problem can be classified into the question of estimating the number of distinct elements in a large collection of elements.
-
-4. This paper argues that the knowledge of the total number of chunks in the data set is **essential** in the sampling process.
-> the overall size of the data set can be computed by a standard traversal of the file system
-> e.g., the unix $du$ command
-
-5. This paper mentions how to sample the file with variable-size chunking:
-> the sampling should choose exact offsets in the file, and then choose the chunk which contains this offset.
+---
+typora-copy-images-to: ../paper_figure
+---
+Estimation of Deduplication Ratios in Large Data Sets
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| MSST'12 | Deduplication Estimation |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper studies the problem of accurately estimating the data reduction ratio achieved by deduplication and compression on a *specific* data set.
+> this paper focuses on what can be done when scanning the entire data set.
+> using as little RAM as possible
+> without reading most of the actual data.
+
+The method of this work can be used to:
+>1. estimating the number of disks to buy
+>2. choosing a deduplication technique
+>3. deciding whether or not to dedupe
+
+### Estimation Method
+- Key assumption
+This paper studies what can be done under the assumption that **essentially the whole data set is scanned**.
+> how to estimate the deduplication and compression in an efficient manner.
+
+1. The main bulk of the work
+reading data from disk, computing the hash, running a compression algorithm, and updating the index table.
+> heaviest parts: access to the disk, and compression.
+
+- A general framework
+1. Sample phase
+the sample is taken at random such that each element is included independently.
+> Hash values and compression rates are computed for each element in the sample.
+
+From the entire dataset, it chooses $m$ elements ($m$ is a configurable parameter fixed in advance), computes the corresponding hash values, and adds them to a set as the base sample.
+> record the count
+
+**Rationale**: in this scheme, an element that has two replicas in the data set has double the probability of being included in the base sample.
+
+**Sample Method**:
+> 1. choose $m$ random numbers in $\{1, \ldots, n\}$, or
+> 2. generate a random number according to a binomial distribution.
+
+2. Scan phase (used to derive the data-reduction ratio estimate)
+store statistics only about elements in the base sample.
+> The hash is computed for each element, but it is only recorded if it matches the hash of an element in the base sample.
+
+The entire dataset is scanned and, for each element, its hash signature is computed.
+> If this signature matches one in the base sample, the corresponding count is incremented by 1.
+
+It finally computes the estimate (a sketch follows the formula):
+$$
+Est = \frac{1}{m} \sum_{i \in Sample} \frac{BaseCount_{i}}{Total_{i}}
+$$
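+
+A compact sketch of the two phases (my reading of the note; the names and the in-memory "scan" are assumptions):
+
+```python
+# Two-phase estimator sketch for Est = (1/m) * sum(BaseCount_i / Total_i).
+import hashlib
+import random
+
+def estimate_dedup(elements, m):
+    """elements: list of chunk contents (bytes); scanned twice in this sketch."""
+    n = len(elements)
+    fp = lambda e: hashlib.sha1(e).digest()
+    # Sample phase: pick m random positions; count how often each hash was picked.
+    base_count = {}
+    for _ in range(m):
+        h = fp(elements[random.randrange(n)])
+        base_count[h] = base_count.get(h, 0) + 1
+    # Scan phase: count total occurrences, but only for hashes in the base sample.
+    total = {h: 0 for h in base_count}
+    for e in elements:
+        h = fp(e)
+        if h in total:
+            total[h] += 1
+    return sum(base_count[h] / total[h] for h in base_count) / m
+```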
+
+- Full file deduplication
+deduplication is only done between identical files
+> achieves less than optimal deduplication ratios, yet it is easy to implement and can perform sufficiently well in some workloads.
+
+1. Sampling files
+> In this paper, the actual data needs to be read only for a small fraction of the files, namely those related to the base sample.
+> When handling files, each byte is sampled independently with probability $\frac{m}{N}$, where $N$ is the total number of bytes in the dataset.
+
+2. Scan optimization
+hash only the first block of the file
+> in many file systems, the first block resides in the i-node of the file.
+> thus it can be read quickly during a metadata scan without extra disk seeks.
+
+3. Use the following information:
+> 1. the length of the file
+> 2. a hash signature of the first block of the file.
+
+If both of these match for a file, compute the full hash of this file and count its frequency (a sketch follows).
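+
+A sketch of this cheap two-level filter (assumed names; not the paper's code):
+
+```python
+# Group files by (length, first-block hash); only groups with more than one
+# member need the full-file hash to confirm duplicates.
+import hashlib
+from collections import defaultdict
+
+def candidate_duplicate_files(files, first_block=4096):
+    """files: dict of path -> bytes. Returns full-hash -> list of duplicate paths."""
+    groups = defaultdict(list)
+    for path, data in files.items():
+        key = (len(data), hashlib.sha1(data[:first_block]).digest())
+        groups[key].append(path)
+    duplicates = defaultdict(list)
+    for paths in groups.values():
+        if len(paths) > 1:                   # cheap filter matched: hash the full files
+            for path in paths:
+                full = hashlib.sha1(files[path]).digest()
+                duplicates[full].append(path)
+    return {h: p for h, p in duplicates.items() if len(p) > 1}
+```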
+
+### Implementation and Evaluation
+- Implementation
+the scan phase can be run in parallel on a distributed system: in the parallel execution, each node does the scan locally and accumulates the counts for the data adjacent to that node.
+
+
+- Evaluation
+1. Dataset (4 datasets)
+personal workstations, an enterprise file system repository, and backup data of two types.
+2. Empirical accuracy
+Sample size vs. relative error
+
+
+## 2. Strength (Contributions of the paper)
+1. This paper lays the groundwork on the limitations and inherent difficulties of sampling techniques.
+> both analytically and in practice
+> a sample alone does not suffice to distinguish whether replication is a local phenomenon or a global one.
+
+
+## 3. Weakness (Limitations of the paper)
+1. The method of this paper still needs to pass over the whole dataset twice, which may incur significant disk I/O overhead.
+
+
+2. the higher the data-reduction ratio is, the harder it becomes to give an accurate estimation.
+> In order to determine the sample size, this method needs a bound on the deduplication ratio.
+
+## 4. Future Works
+1. This paper mentions that it is impossible to predict the deduplication ratio accurately by looking only at a random subset of the data.
+> can give arbitrarily skewed results.
+> whether randomly or according to various sampling methodologies.
+
+2. This paper also mentions that one can incorporate prior knowledge about the structure of the data to sample it smartly, extrapolate the overall ratio from the sample, and still perform this efficiently with limited resources.
+> "educated-sampling"
+
+3. This problem can be classified into the question of estimating the number of distinct elements in a large collection of elements.
+
+4. This paper argues that the knowledge of the total number of chunks in the data set is **essential** in the sampling process.
+> the overall size of the data set can be computed by a standard traversal of the file system
+> e.g., the Unix `du` command
+
+5. This paper mentions how to sample a file with variable-size chunking:
+> the sampling should choose exact offsets in the file, and then choose the chunk which contains each offset.
> this relieves the need to read the entire file
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Dedup-Estmation/Rangoli-SYSTOR'13.md b/StoragePaperNote/Deduplication/Dedup-Estmation/Rangoli-SYSTOR'13.md
old mode 100644
new mode 100755
index f1fdafa..28a18f8
--- a/StoragePaperNote/Deduplication/Dedup-Estmation/Rangoli-SYSTOR'13.md
+++ b/StoragePaperNote/Deduplication/Dedup-Estmation/Rangoli-SYSTOR'13.md
@@ -1,92 +1,92 @@
----
-typora-copy-images-to: ../paper_figure
----
-Rangoli: Space management in deduplication environments
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| SYSTOR'13 | Space Management |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-In deduped volumes there is no direct relation between the *the logical size* of the file and *the physical space* occupied by it.
-> hard to find an optimal space reclamation
-> Space reclamation in non-deduped environments is simpler. (guarantee changes in the used space of the volume by an amount equal to the logical size of the file)
-
-In this work, it proposes a fast and efficient tool which can identify the optimal set of files for space reclamation in a deduped environment.
-
-- Two dimensions
-1. Source centric:
-select groups of files at source that have a high degree of disk sharing
-> Migrate them together to the new destination (storage efficiency preservation)
-
-2. Destination aware:
-Pick files at source that potentially have maximum duplicate data at some destination volume
-
-In this paper, it only considers the *source centric* dimension, and the destination is agnostic.
-
-
-### Rangoli
-- Key idea:
-migrating similar files is better for preserving storage efficiency.
-> seek to partition the dataset such that most the data sharing between file within the same partition.
-> files across partitions have little or no data sharing.
-
-
-- Metric
-1. Space Reclamation (SR): for the source volume, the difference in the total used physical space
-2. Cost of Migration (CM): the number of blocks transmitted over the network.
-3. Migration Utility (MU): $\frac{SR}{CM}$ (*higher is better*)
-4. Physical space bloat (PSB): the ratio of increase in the physical space consumption of the dataset to its original space consumption. (*lower is better*)
-
-- Main algorithm
-1. Step 1: FPDB processing
-process the fingerprint database and compute the extent of data sharing across files
-> represent it as a bipartite graph.
-
-In its FPDB, it stores such that there are multiple records with the same fp. Thus, it can achieve its goal via traversing the FPDB.
-> it contains one fingerprint record for every **logical block** of the file.
-
-2. Step 2: Migration binning:
-partition the graph to obtain $K$ migration bins
-> space reclamation is $\frac{1}{K}$ of the volume space. (each migration bin is approximately equal in size)
-
-3. Step 3: Qualification of migration bins
-compute the metrics for each migration bin and chose the best among them.
-> 1. Logical size of a bin $p$:
-> 2. Internal sharing of a bin $b$: denote the extent of data sharing of within the bin
-> 3. Sharing Across of a $bin$: denote the extent of data sharing of the bin $p$ with the remainder of the dataset.
-
-### Implementation and Evaluation
-- Evaluation
-Datasets:
-> four datasets: Debian, HomeDir, VMDK, EngWeb
-
-- Evaluation objectives:
-1. flexibility
-2. impact on space consumption
-3. network costs
-4. scalability
-
-
-## 2. Strength (Contributions of the paper)
-1. a novel solution for space reclamation in deduped environments
-> fast and scalable and tested on real world dataset.
-
-2. a deterministic solution to report the exact metrics **before the actual migration**
-> find the exact space reclamation and associated penalties (e.g., network cost, physical space consumption)
-
-3. investigate how to find optimal datasets for space reclamation
-> better than alternatives based on MinHash
-
-## 3. Weakness (Limitations of the paper)
-1. From my perspective, this algorithm can only fit the NetApp deduplication system since its special design of FPDB.
-
-
-## 4. Some Insights (Future work)
-1. we can consider the similarity indicative hashes to repesent the similarity of the data
-> min-hash, minimum hash
-
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Rangoli: Space management in deduplication environments
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| SYSTOR'13 | Space Management |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+In deduped volumes there is no direct relation between the *logical size* of a file and the *physical space* occupied by it.
+> hard to find an optimal set of files for space reclamation
+> Space reclamation in non-deduped environments is simpler (removing a file changes the used space of the volume by an amount equal to the logical size of the file).
+
+In this work, it proposes a fast and efficient tool which can identify the optimal set of files for space reclamation in a deduped environment.
+
+- Two dimensions
+1. Source centric:
+select groups of files at source that have a high degree of disk sharing
+> Migrate them together to the new destination (storage efficiency preservation)
+
+2. Destination aware:
+Pick files at source that potentially have maximum duplicate data at some destination volume
+
+In this paper, only the *source centric* dimension is considered; the approach is destination-agnostic.
+
+
+### Rangoli
+- Key idea:
+migrating similar files is better for preserving storage efficiency.
+> seek to partition the dataset such that most of the data sharing is between files within the same partition.
+> files across partitions have little or no data sharing.
+
+
+- Metric
+1. Space Reclamation (SR): for the source volume, the difference in the total used physical space
+2. Cost of Migration (CM): the number of blocks transmitted over the network.
+3. Migration Utility (MU): $\frac{SR}{CM}$ (*higher is better*)
+4. Physical space bloat (PSB): the ratio of increase in the physical space consumption of the dataset to its original space consumption. (*lower is better*)
+
+- Main algorithm
+1. Step 1: FPDB processing
+process the fingerprint database and compute the extent of data sharing across files
+> represent it as a bipartite graph.
+
+In its FPDB, there can be multiple records with the same fingerprint, so the sharing information can be obtained by traversing the FPDB.
+> the FPDB contains one fingerprint record for every **logical block** of a file.
+
+2. Step 2: Migration binning:
+partition the graph to obtain $K$ migration bins
+> space reclamation is $\frac{1}{K}$ of the volume space. (each migration bin is approximately equal in size)
+
+3. Step 3: Qualification of migration bins
+compute the metrics for each migration bin and choose the best among them (a sketch follows).
+> 1. Logical size of a bin $p$
+> 2. Internal sharing of a bin $p$: denotes the extent of data sharing within the bin
+> 3. Sharing across of a bin $p$: denotes the extent of data sharing of the bin with the remainder of the dataset.
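+
+One plausible way to compute the qualification metrics (my interpretation, not the paper's code; fixed-size blocks and the helper name are assumptions):
+
+```python
+# Qualify migration bins by Migration Utility MU = SR / CM.
+def qualify_bins(bins, block_size=4096):
+    """bins: list of sets of physical-block fingerprints referenced by each bin's files.
+    Returns (index of the best bin, per-bin (SR, CM, MU) metrics)."""
+    metrics = []
+    for i, blocks in enumerate(bins):
+        others = set().union(*(b for j, b in enumerate(bins) if j != i))
+        exclusive = blocks - others          # blocks shared with no other bin
+        sr = len(exclusive) * block_size     # space reclaimed at the source volume
+        cm = len(blocks) * block_size        # distinct blocks sent over the network
+        metrics.append((sr, cm, sr / cm if cm else 0.0))
+    best = max(range(len(bins)), key=lambda i: metrics[i][2])
+    return best, metrics
+```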
+
+### Implementation and Evaluation
+- Evaluation
+Datasets:
+> four datasets: Debian, HomeDir, VMDK, EngWeb
+
+- Evaluation objectives:
+1. flexibility
+2. impact on space consumption
+3. network costs
+4. scalability
+
+
+## 2. Strength (Contributions of the paper)
+1. a novel solution for space reclamation in deduped environments
+> fast, scalable, and tested on real-world datasets.
+
+2. a deterministic solution to report the exact metrics **before the actual migration**
+> find the exact space reclamation and associated penalties (e.g., network cost, physical space consumption)
+
+3. investigate how to find optimal datasets for space reclamation
+> better than alternatives based on MinHash
+
+## 3. Weakness (Limitations of the paper)
+1. From my perspective, this algorithm may only fit the NetApp deduplication system because of its special FPDB design.
+
+
+## 4. Some Insights (Future work)
+1. we can consider similarity-indicative hashes to represent the similarity of the data
+> e.g., MinHash (minimum hash)
+
+
diff --git a/StoragePaperNote/Deduplication/Dedup-Estmation/SketchDeduplication-FAST'19.md b/StoragePaperNote/Deduplication/Dedup-Estmation/SketchDeduplication-FAST'19.md
old mode 100644
new mode 100755
index 70d98f7..10d8711
--- a/StoragePaperNote/Deduplication/Dedup-Estmation/SketchDeduplication-FAST'19.md
+++ b/StoragePaperNote/Deduplication/Dedup-Estmation/SketchDeduplication-FAST'19.md
@@ -1,118 +1,118 @@
----
-typora-copy-images-to: paper_figure
----
-Sketching Volume Capacities in Deduplicated Storage
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| FAST'19 | Sketch+Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This work focuses on technologies and tools for **managing storage capacities** in storage system with deduplication.
-> analyzing capacities in deduplicated storage environment
-> reclaimable capacity, attributed capacity
-
-The key issus: once deduplication is brought in to the equation, the capacity of a volume is no longer a known quantity.
-> for any single volume or any combination of volumes
-
-This work addresses gaps for reporting and management of data that has already been deduplicated, which prior works do not address.
-
-### Volume Sketch
-- Main idea:
-the decision to forgo the attempt to produce accurate statistics
-
-This paper borrow techniques from the realm of streaming algorithms (sketch)
-> the metadata of each volume is sampled using a **content-based** sampling technique to produce a **capacity sketch** of the volume.
-> key property: sketch is much smaller than the actual metadata, yet contains enough inoformation to evaluate the volumes capacity properties.
-
-
-- Capacity sketches
-1. choose samples of the metadata according to the respective data content.
-> for each data chunk, examine its fingerprint (the hash), and include it in the sketch only if it contains $k$ leading zeros for a parameter $k$
-> Sample ratio: $\frac{1}{2^k}$, also called **sketch factor**.
-> this is the tradeoff: the required resources to handle the sketches vs. the accuracy which they provide.
-
-$$
-\hat{Space} = 2^k \sum_{h \in Sketch}(CompRatio(h) \times ChunkSize)
-$$
-$$
-ReduceRatio = \frac{\hat{Space}}{Written_S}
-$$
-
-2. using sketches for data inside a deduplication system
-Maintain at all time a **full system** sketch
-> representing all of the data in the system
-
-It also collects further parameters in the sketch
-> Reference count: the number of times the data chunk with fingerprint $h$ was written in the data set
-> Physical count: the number of physical copies stored during writes to the data set $S$.
-
-- Attributed capacity and data reduction ratios
-Those information can let the adminstrator to understand the data reducation properties of volumes.
-> how much is a volume involved in deduplication
-> **attributed capacity**: a breakdown of a volume of its space savings to deduplication and compression.
-> For example: if a data chunk reference is 3, 2 originating from volume $A$ and one from volume $B$, then the space is split in a $\frac{2}{3}$ and $\frac{1}{3}$ fashion between volume $A$ and $B$ respectively.
-
-- Accuracy guarantee
-1. Chernoff Bound
-2. Bernoulli variables
-
-Very similar to the MSST'12's work
-
-
-- System architecture
-
-
-pull the sketch data out of the storage system onto an **adjacent management server** where the sketch data is analyzed.
-> this avoid using CPU and memory resources in the storage that could otherwise be spent on serving I/O requests.
-
-1. All sketch data is held in memory at all times. (allow to retrieve the data swiftly)
-2. Each process is in charge of serving IOs for slices of the entire virtual space.
-> each slice has its own sketch which is maintained by the owning process.
-
-3. The sketch portrays the state of a slice at a point in time.
-4. A central process contacts all processes and retrieves their respective sketch data.
-
-### Implementation and Evaluation
-- Architecture and implementation
-1. Sketch analyzer
-> **Ingest phase**: for each volume, it collects all of its relevant hashes while merging and aggregating multiple appearances of hashes.
-> 1. a full system hash table: all of the hashes seen in the full system
-> 2. volume level structures: A B-Tree for each volume in the system which aggregates the hashes (only store a pointer to the entry in the full table)
-
-
-- Evaluation
-Dataset:
-> 1. Synthetic data: using VDBench benchmarking suite
-> 2. UBC data traces
-> 3. Production data in the field
-
-1. Ingest and analyze performance
-2. Group query performance
-3. Estimation accuracy
-
-
-
-
-## 2. Strength (Contributions of the paper)
-1. This work enables to query reclaimable capacities and attributed capacities for any volume in the system
-2. It can also answer how much physical space such a volume/group would consume if it were to be migrated to another deduplicated storage system.
-3. It also discusses the accuracy guarantees of the sketch method.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. This paper mentions that there a number of reasons for a chunk to have more than a single physical copy in a system.
-> deduplication opportunities that were not identified
-> the choice to forgo a deduplication opportunity for avoiding extensive data fragmentation
-
-2. This paper mentions that the sketch provides a fuzzy state, when it actually obtains the sketch for the last slice, the sketch in the system for the early slices might have changed.
-> This is an inaccuracy that the storage systems are dynamic and it cannot expect to freeze them at a specific state.
-> It needs to reduce the time window in which the sketches are extracted.
-
-3. This paper mentions that under the randomness of SHA-1, a false collision of 8 bytes in the sketch data would occur on average once on every 256PB
-
-4. For sharding, due to the randomness of the hash function, it is expected that each such partition will receive a fair share of the load.
+---
+typora-copy-images-to: paper_figure
+---
+Sketching Volume Capacities in Deduplicated Storage
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| FAST'19 | Sketch+Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This work focuses on technologies and tools for **managing storage capacities** in storage systems with deduplication.
+> analyzing capacities in a deduplicated storage environment
+> reclaimable capacity, attributed capacity
+
+The key issue: once deduplication is brought into the equation, the capacity of a volume is no longer a known quantity.
+> for any single volume or any combination of volumes
+
+This work addresses gaps in the reporting and management of data that has already been deduplicated, which prior works do not address.
+
+### Volume Sketch
+- Main idea:
+the decision to forgo the attempt to produce accurate statistics
+
+This paper borrows techniques from the realm of streaming algorithms (sketches)
+> the metadata of each volume is sampled using a **content-based** sampling technique to produce a **capacity sketch** of the volume.
+> key property: the sketch is much smaller than the actual metadata, yet contains enough information to evaluate the volume's capacity properties.
+
+
+- Capacity sketches
+1. choose samples of the metadata according to the respective data content.
+> for each data chunk, examine its fingerprint (the hash) and include it in the sketch only if it has $k$ leading zeros, for a parameter $k$
+> sample ratio: $\frac{1}{2^k}$, also called the **sketch factor**.
+> this sets the tradeoff: the resources required to handle the sketches vs. the accuracy they provide (a sketch of the computation follows the formulas below).
+
+$$
+\hat{Space} = 2^k \sum_{h \in Sketch}(CompRatio(h) \times ChunkSize)
+$$
+$$
+ReduceRatio = \frac{\hat{Space}}{Written_S}
+$$
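+
+A minimal sketch of the sketch construction and the space estimate (assumed names; SHA-1 as the fingerprint):
+
+```python
+# Content-based capacity sketch: keep a chunk's fingerprint iff it has k leading
+# zero bits (sample ratio 1/2^k); estimate physical space from the sketch.
+import hashlib
+
+def build_sketch(chunks, k):
+    """chunks: iterable of (data, comp_ratio) pairs."""
+    sketch = {}
+    for data, comp_ratio in chunks:
+        fp = hashlib.sha1(data).digest()
+        if int.from_bytes(fp, "big") >> (160 - k) == 0:   # k leading zero bits
+            sketch[fp] = comp_ratio                       # duplicates collapse
+    return sketch
+
+def estimate_physical_space(sketch, k, chunk_size=8192):
+    """Space estimate = 2^k * sum over the sketch of CompRatio(h) * ChunkSize."""
+    return (2 ** k) * sum(cr * chunk_size for cr in sketch.values())
+```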
+
+2. using sketches for data inside a deduplication system
+Maintain at all times a **full system** sketch
+> representing all of the data in the system
+
+It also collects further parameters in the sketch
+> Reference count: the number of times the data chunk with fingerprint $h$ was written in the data set
+> Physical count: the number of physical copies stored during writes to the data set $S$.
+
+- Attributed capacity and data reduction ratios
+This information lets the administrator understand the data reduction properties of volumes.
+> how much a volume is involved in deduplication
+> **attributed capacity**: a breakdown of a volume's space savings into deduplication and compression.
+> For example: if a data chunk's reference count is 3, with 2 references originating from volume $A$ and one from volume $B$, then its space is split in a $\frac{2}{3}$ and $\frac{1}{3}$ fashion between volumes $A$ and $B$, respectively (a small sketch follows).
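+
+A small sketch of this proportional attribution (assumed names and data layout):
+
+```python
+# Attribute each chunk's physical space across volumes in proportion to their
+# reference counts, as in the 2/3 vs 1/3 example above.
+from collections import defaultdict
+
+def attribute_capacity(chunk_refs, chunk_size=8192):
+    """chunk_refs: dict fingerprint -> {volume: reference_count}."""
+    attributed = defaultdict(float)
+    for refs in chunk_refs.values():
+        total = sum(refs.values())
+        for volume, count in refs.items():
+            attributed[volume] += chunk_size * count / total
+    return dict(attributed)
+```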
+
+- Accuracy guarantee
+1. Chernoff Bound
+2. Bernoulli variables
+
+Very similar to the MSST'12 work.
+
+
+- System architecture
+
+
+pull the sketch data out of the storage system onto an **adjacent management server** where the sketch data is analyzed.
+> this avoids using CPU and memory resources in the storage system that could otherwise be spent on serving I/O requests.
+
+1. All sketch data is held in memory at all times. (allows retrieving the data swiftly)
+2. Each process is in charge of serving IOs for slices of the entire virtual space.
+> each slice has its own sketch which is maintained by the owning process.
+
+3. The sketch portrays the state of a slice at a point in time.
+4. A central process contacts all processes and retrieves their respective sketch data.
+
+### Implementation and Evaluation
+- Architecture and implementation
+1. Sketch analyzer
+> **Ingest phase**: for each volume, it collects all of its relevant hashes while merging and aggregating multiple appearances of hashes.
+> 1. a full system hash table: all of the hashes seen in the full system
+> 2. volume-level structures: a B-Tree for each volume in the system which aggregates the hashes (only stores a pointer to the entry in the full system table)
+
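+A rough sketch of these ingest structures (my own illustration; plain dicts stand in for the full system hash table and the per-volume B-Trees, and the entry fields are assumptions):
+
+```python
+def ingest(sketch_entries):
+    """sketch_entries: iterable of (volume_id, hash, ref_count, physical_count)."""
+    full_table = {}                      # hash -> aggregated counters (full system table)
+    per_volume = {}                      # volume -> {hash: pointer into full_table}
+    for volume, h, refs, phys in sketch_entries:
+        entry = full_table.setdefault(h, {"refs": 0, "phys": 0})
+        entry["refs"] += refs            # merge multiple appearances of the same hash
+        entry["phys"] += phys
+        per_volume.setdefault(volume, {})[h] = entry   # pointer, not a copy
+    return full_table, per_volume
+
+full, per_vol = ingest([("A", "h1", 2, 1), ("B", "h1", 1, 0), ("A", "h2", 1, 1)])
+print(full["h1"], sorted(per_vol))       # {'refs': 3, 'phys': 1} ['A', 'B']
+```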
+
+- Evaluation
+Dataset:
+> 1. Synthetic data: using VDBench benchmarking suite
+> 2. UBC data traces
+> 3. Production data in the field
+
+1. Ingest and analyze performance
+2. Group query performance
+3. Estimation accuracy
+
+
+
+
+## 2. Strength (Contributions of the paper)
+1. This work enables querying reclaimable capacities and attributed capacities for any volume in the system
+2. It can also answer how much physical space such a volume/group would consume if it were to be migrated to another deduplicated storage system.
+3. It also discusses the accuracy guarantees of the sketch method.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. This paper mentions that there are a number of reasons for a chunk to have more than a single physical copy in a system.
+> deduplication opportunities that were not identified
+> the choice to forgo a deduplication opportunity for avoiding extensive data fragmentation
+
+2. This paper mentions that the sketch provides a fuzzy state: by the time it actually obtains the sketch for the last slice, the sketches in the system for the earlier slices might have changed.
+> This inaccuracy arises because storage systems are dynamic and one cannot expect to freeze them at a specific state.
+> It needs to reduce the time window in which the sketches are extracted.
+
+3. This paper mentions that under the randomness of SHA-1, a false collision of 8 bytes in the sketch data would occur on average once in every 256PB.
+
+4. For sharding, due to the randomness of the hash function, it is expected that each such partition will receive a fair share of the load.
> sketch distribution and concurrency
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Dedup-Estmation/UnseenDeduplication-FAST'16.md b/StoragePaperNote/Deduplication/Dedup-Estmation/UnseenDeduplication-FAST'16.md
old mode 100644
new mode 100755
index 47a0db3..d1e2a31
--- a/StoragePaperNote/Deduplication/Dedup-Estmation/UnseenDeduplication-FAST'16.md
+++ b/StoragePaperNote/Deduplication/Dedup-Estmation/UnseenDeduplication-FAST'16.md
@@ -1,111 +1,111 @@
----
-typora-copy-images-to: paper_figure
----
-Estimating Unseen Deduplication - from Theory to Practice
-------------------------------------------
-| Venue | Category |
-| :-----: | :--------------------: |
-| FAST'16 | Sample + Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper intends to solve the problem of deduplication estimation, that is given a large dataset and try to understand the potential benfit from deduplication.
-> Potential customers need this information in order to make informed decisions on whether high-end storage with deduplication is worthwhile for them.
-
-The difficulty stems of deduplication estimation: deduplication is a **global** property
-> naive way: need to search across large amounts of data.
-> read a large fraction of the data from disk.
-
-State-of-art countermeasures:
-1. vague estimation based on prior knowledge on workload
-> is highly inaccurate in reality (varying from a long range)
-
-2. Scan the entire dataset
-> requires a large amount of memory and disk operations
-
-The main challenges are to make deduplication algorithms actually applicable and worthwhile in a real world scenario.
-
-### Estimating Unseen Deduplication
-In this work, they study the ability to estimate deduplication while not reading the entire dataset.
-
-- The key questions in this method:
-> 1. understanding the estimation accuracy: what sample size is enough to achieve a good accuracy?
-> 2. how to run the estimation algorithm with low memory consummption
-> 3. how to combine deduplication and compression
-
-- In this work, it terms a key concept called **Duplication Frequency Histogram**
-Based on DFH, it designs the unseen algorithm to estimate the DFH of a dataset
-> from the DFH, it devises an estimation of the deduplication.
-> Can take a DHF of a full set or a sample of the set
-
-
-
-- The Unseen Algorithm
-1. Input: the observed DFH $y$ of the observed sample $S_p$ ($S_p$ determines the expected matrix transformation)
-2. Process: find an estimation $\hat{x}$ of the DFH of the entire dataset $S$, which hash **minimal distance** from y
-3. Output: dedupe ratio according to $\hat{x}$
-> Rationale: 
-
-4. distance: use a normalized $L1$ Norm
-
-- Gauging the Accuracy (Range Unseen algorithm)
-1. 5% to 15% is very sufficient for accurate estimation. (for a range)
-2. To solve the question that how to interpret the estimation result and when sample is enough
-> It proposes a new approach that can return **a range of plausible deduplication ratio** rather than a single estimation number
-> returns upper and lower bounds (the actual ratio lies inside the range)
->
-> 
-
-3. Add two linear programs to find $x_{min}$ with the minimal dedupe ratio, and $x_{max}$ with the maximal dedupe ratio.
-4. Need set a slackness parameter $\alpha$ (small $\rightarrow$ tighter estimation range)
-
-- Sample Approach
-Issue: when the sample rate is high, it needs to keep the duplication frequencies of all distinct elements in the sample $\rightarrow$ the memory overhead would be high (GB)
-
-To slove this, it proposes two approaches with around 10MBs of RAM:
-1. Base sample approach: add another process of estimation (increase slackness parameter)
-> Shortcoming: the dataset to be studied needs to be set in advance. (not dynamic)
-
-2. A streaming approach: using stream algorithm towards distinct elements evaluation with low memory.
-> Only the C chunks that have the highest hash values
-
-- Estimating Combined Compression and Deduplication
-Add the weight in DFH: compression weight.
-
-- How to sample in the read work?
-1. Sample requirement
-> 1. sample uniformly at random over the entire dataset
-> 2. without repetitions
-> 3. using low memory
-> 4. gradual sample
-> > sample a small precent, evaulate, then add more samples if needed
->
-> 5. faster than running a full scan (scan time dominates the running time)
-
-2. Sample Size vs. Sampling Time
-mitigate the drop off in short random reads in HDDs:
-> 1. sample chunks: using larger chunk sizes ($4KB \rightarrow 1MB$ )
-
-3. Sample Strategy
-Simple way: use a fast hash function to hash the chunk ID, and ensure the output range is [0, 1), use this output to ensure whether to add this chunk to the sample.
-> Not the cryptographic hash like SHA-1
-> To ensure the overhead of sample is negligible.
-
-
-
-### Implementation and Evaluation
-- Implementation
-The core techniques in Matlab, evaluation is based on various real life datasets
-
-
-## 2. Strength (Contributions of the paper)
-1. The intuition of this method is very simple, that instead of computing the convergent result, it just concerns the range of the deduplication ratio under a given sample rate. This is sufficient for practical propose, and can save the time of repetitions .
-2. It also considers the issue of memory overhead of sampling when the sample rate is relativly high (15%)
-## 3. Weakness (Limitations of the paper)
-1. For the range unseen algorithm, I think it is not very reasonable to replace the original unseen algorithm.
-
-2. In its evaluation part, it does not provide the part of deduplication performance. Just present the estimation range with different configuration.
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+Estimating Unseen Deduplication - from Theory to Practice
+------------------------------------------
+| Venue | Category |
+| :-----: | :--------------------: |
+| FAST'16 | Sample + Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper intends to solve the problem of deduplication estimation, that is, given a large dataset, to understand the potential benefit from deduplication.
+> Potential customers need this information in order to make informed decisions on whether high-end storage with deduplication is worthwhile for them.
+
+The difficulty of deduplication estimation stems from the fact that deduplication is a **global** property
+> naive way: need to search across large amounts of data.
+> read a large fraction of the data from disk.
+
+State-of-the-art approaches:
+1. vague estimation based on prior knowledge of the workload
+> highly inaccurate in reality (estimates vary over a wide range)
+
+2. Scan the entire dataset
+> requires a large amount of memory and disk operations
+
+The main challenge is to make deduplication estimation algorithms actually applicable and worthwhile in real-world scenarios.
+
+### Estimating Unseen Deduplication
+In this work, they study the ability to estimate deduplication while not reading the entire dataset.
+
+- The key questions in this method:
+> 1. understanding the estimation accuracy: what sample size is enough to achieve a good accuracy?
+> 2. how to run the estimation algorithm with low memory consumption
+> 3. how to combine deduplication and compression
+
+- This work introduces a key concept called the **Duplication Frequency Histogram** (DFH)
+Based on the DFH, it designs the unseen algorithm to estimate the DFH of a dataset
+> from the DFH, it derives an estimation of the deduplication ratio.
+> A DFH can be taken over the full set or over a sample of the set
+
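+As a concrete illustration (my own toy code, not the paper's), a DFH maps each duplication frequency to the number of distinct chunks with that frequency, and a deduplication ratio can be read off it; here I define the ratio as distinct over written chunks, which is an assumption about the convention.
+
+```python
+from collections import Counter
+
+def duplication_frequency_histogram(fingerprints):
+    """DFH: maps frequency f -> number of distinct chunks appearing exactly f times."""
+    per_chunk = Counter(fingerprints)        # fingerprint -> duplication frequency
+    return Counter(per_chunk.values())       # frequency   -> number of distinct chunks
+
+def dedupe_ratio(dfh):
+    """Here taken as physical (distinct) chunks over logical (written) chunks."""
+    distinct = sum(dfh.values())
+    written = sum(freq * n for freq, n in dfh.items())
+    return distinct / written
+
+fps = ["a", "b", "a", "c", "a", "b"]         # toy chunk fingerprints
+dfh = duplication_frequency_histogram(fps)   # Counter({3: 1, 2: 1, 1: 1})
+print(dedupe_ratio(dfh))                     # 3 distinct / 6 written = 0.5
+```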
+
+
+- The Unseen Algorithm
+1. Input: the observed DFH $y$ of the observed sample $S_p$ ($S_p$ determines the expected matrix transformation)
+2. Process: find an estimation $\hat{x}$ of the DFH of the entire dataset $S$, which has **minimal distance** from $y$
+3. Output: dedupe ratio according to $\hat{x}$
+> Rationale: 
+
+4. distance: use a normalized $L_1$ norm
+
+- Gauging the Accuracy (Range Unseen algorithm)
+1. A sample of 5% to 15% is sufficient for an accurate estimation (of a range).
+2. To address how to interpret the estimation result and decide when the sample is large enough
+> It proposes a new approach that returns **a range of plausible deduplication ratios** rather than a single estimation number
+> returns upper and lower bounds (the actual ratio lies inside the range)
+>
+> 
+
+3. Add two linear programs to find $x_{min}$ with the minimal dedupe ratio, and $x_{max}$ with the maximal dedupe ratio.
+4. Need to set a slackness parameter $\alpha$ (smaller $\rightarrow$ tighter estimation range)
+
+- Sample Approach
+Issue: when the sample rate is high, it needs to keep the duplication frequencies of all distinct elements in the sample $\rightarrow$ the memory overhead would be high (GB)
+
+To solve this, it proposes two approaches that use around 10MB of RAM:
+1. Base sample approach: add another process of estimation (increase slackness parameter)
+> Shortcoming: the dataset to be studied needs to be set in advance. (not dynamic)
+
+2. A streaming approach: using a streaming algorithm for distinct element estimation with low memory.
+> keep only the $C$ chunks that have the highest hash values
+
+- Estimating Combined Compression and Deduplication
+Add the weight in DFH: compression weight.
+
+- How to sample in the real world?
+1. Sample requirement
+> 1. sample uniformly at random over the entire dataset
+> 2. without repetitions
+> 3. using low memory
+> 4. gradual sample
+> > sample a small percent, evaluate, then add more samples if needed
+>
+> 5. faster than running a full scan (scan time dominates the running time)
+
+2. Sample Size vs. Sampling Time
+mitigate the throughput drop-off of short random reads on HDDs:
+> 1. sample in larger units (e.g., $4KB \rightarrow 1MB$ chunks)
+
+3. Sample Strategy
+Simple way: use a fast hash function to hash the chunk ID, map the output into the range [0, 1), and use this value to decide whether to add the chunk to the sample.
+> Not a cryptographic hash like SHA-1
+> To ensure that the overhead of sampling is negligible.
+
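+A minimal sketch of this decision rule (my own illustration; CRC32 stands in for whatever fast non-cryptographic hash the paper actually uses):
+
+```python
+import zlib
+
+def sample_chunk(chunk_id: bytes, sample_rate: float) -> bool:
+    """Map a fast hash of the chunk ID to [0, 1) and keep the chunk iff
+    the value falls below the sample rate (deterministic per chunk)."""
+    h = zlib.crc32(chunk_id)                 # fast, non-cryptographic
+    return h / 2**32 < sample_rate
+
+sample = [cid for cid in (b"chunk-%d" % i for i in range(1000))
+          if sample_chunk(cid, 0.15)]        # roughly a 15% sample
+```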
+
+
+### Implementation and Evaluation
+- Implementation
+The core techniques are implemented in Matlab; the evaluation is based on various real-life datasets
+
+
+## 2. Strength (Contributions of the paper)
+1. The intuition of this method is very simple: instead of computing a convergent point estimate, it only concerns the range of the deduplication ratio under a given sample rate. This is sufficient for practical purposes and saves the time of repeated sampling.
+2. It also considers the memory overhead of sampling when the sample rate is relatively high (15%)
+## 3. Weakness (Limitations of the paper)
+1. For the range unseen algorithm, I think replacing the original unseen algorithm with it is not well justified.
+
+2. In its evaluation, it does not report deduplication performance; it only presents the estimation range under different configurations.
+
+## 4. Future Works
This work gives me the insight that one can use a sample of the workload to estimate properties of the whole workload. This can be applied in many scenarios
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/CloudTier-ATC'19.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/CloudTier-ATC'19.md
old mode 100644
new mode 100755
index d90dbb6..19da046
--- a/StoragePaperNote/Deduplication/Deduplication-System-Design/CloudTier-ATC'19.md
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/CloudTier-ATC'19.md
@@ -1,144 +1,144 @@
----
-typora-copy-images-to: ../paper_figure
----
-Data Domain Cloud Tier: Backup here, backup there, deduplicated everywhere!
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| USENIX ATC'19 | Cloud Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Object storage in public and private clouds provides cost-effective, on-demand, always available storage.
-
-- Data Domain added a deduplicated **cloud tier** to its data protection appliances
-1. Its deduplication system consists of an **active tier**:
-> customers backup their primary data (typically retained for 30-90 days)
-
-2. It also contains a **cloud tier**:
-> selected backups are transitioned to cloud storage (retained long term 1-7 years)
-
-- Motivation
-there were many architectural changes necessary to support a cloud tier in a mature storage product.
-
-### Cloud Tier
-- Data domain cloud tier architecture
-
-
-
-- Active Tier Architecture
-
-> 1. File is represented by a **Merkle tree** with user data as variable sized chunk at the bottom level of the tree. (referred as *L0 chunks*)
-> 2. SHA-1 fingerprint of *L0 chunks* are grouped together at the next higher level of the tree to form chunks. (*L1 chunks*)
-> 3. the top of the tree is (*L6 chunks*)
-> 4. LP chunks: chunks above L0 as LP chunks
-> 5. If two files are exactly the same, they would have the same L6 fingerprint. (if two files only partially overlap in content, then some branches of the tree will be identical.)
-
-*Metadata and data separation*: L0-containers and LP-containers
-
-> the locality of L0 chunks is preserved which results in better read performance
-> Each container: data section (chunk), metadata section (fingerprints of chunks)
-
-- Cloud Tier Architecture
- Goal of cloud tier:
-
-> 1. use it as extra capacity
-> 2. use it for long term archival of selected data
-
-**Metadata container**: it refers a third type of container, which stores the metadata sections from multiple *L0 and LP-Containers*
-> the metadata section of containers are reading during deduplication and garbage collection, and require quick access, so Metadata-Containers are stored on the **local storage** as well as in **cloud storage**.
-> mirror the metadata
-
-**Main difference**: store critical cloud tier metadata on the local storage of the Data Domain system to improve performance and reduce cost.
-
-- Object Size (Container Size)
-it starts with 64KB objects, but evolved to large sizes in the range of 1-4MB for several reasons
-> larger objects result in less metadata overhead.
-> also decrease transaction costs as cloud storage providers charge per-object transaction costs.
-> terms **objects** and **containers** interchangeably.
-
-- Perfect hashing and Physical scanning for the cloud tier
-1. Perfect hashing
-perform a membership query by representing **a fixed key set**.
-> a perfect hash function is a collision-free mapping which maps a key to a unique position in a bit vector. (1 : 1 mapping)
-> a perfect hash function + a bit vector
-
-2. Physical scanning
-discuss how to walk the LP chunks of all or most of the files in the system.
-> how to traverse the Merkle tree
-> enumeration is done in a breadth first manner
-
-
-- Estimate freeable space
-1. traverse the merkle tree of selected files, mark the chunks in an in-memory perfect hash vector.
-2. walk all the remaining files, unmark the chunks referenced by these remaining files.
-3. the chunks which are still marked in the perfect hash vector are the chunks which are uniquely referenced by files selected.
-
-- Seeding
-a one time process to transfer a large amount of data from the active tier to a nearly empty cloud tier. (need to generate perfect hash vector)
-1. used for migration of large amount of data to the cloud.
-2. guarantees that all the L0 and LP chunks are transferred to the cloud before the file's location is changed in the namespace. (using perfect hash vector)
-
-- File Migration
-transfer a few files incrementally.
-> reduces the amount of data transferred to the cloud tier by performing a deduplication process relative to chunks already present in the cloud tier.
-> do not need to generate perfect hash vector, directly scan Merkle tree.
-
-- Garbage collection (GC)
-When customers expire backups, some chunks become unreferenced.
-> mark and sweep
-
-1. Active tier garbage collection
-after marking chunks live in the perfect hash vector, the sweep process walks the container set to copy live chunks from old containers into newer containers while deleting the old containers.
-
-2. Cloud tier garbage collection
-Since the L0 containers are not local and reading them from the cloud is expensive. It needs a way to do garbage collection without reading the L0-container from object storage and writing new L0-container to object storage.
-> need to implement new APIs in cloud providers.
-> delete a compression region in a cloud container when it is completely unreferenced instead of individual chunk.
-
-
-### Implementation and Evaluation
-- Evaluation
-1. Deployed system evaluation
-> GC analysis
-> Cleaning efficiency loss due to compression region cleaning
-
-2. Internal systems
-> Freeable space estimation
-> File migration and seeding performance
-> Garbage collection performance
-> File migration and restore from the cloud
-
-## 2. Strength (Contributions of the paper)
-1. propose a new algorithm to estimate the amount of space unique to a set of files.
-> builds upon a previous technique using **perfect hashes** and **sequential storage scans**.
-
-2. develop a bulk seeding algorithm that also uses perfect hashes to select the set of chunks to transfer.
-> transfer the unique content to the cloud tier to preserve the benefits of deduplication during transfer.
-
-
-3. design collection scheme for the cloud tier
-> handle the latency and financial cost of reading data from the cloud back to the on-premises appliance.
-
-
-## 3. Weakness (Limitations of the paper)
-1. For its cloud tier garbage collection, it needs to modify the internal APIs of the cloud providers.
-
-## 4. Some insights
-1. This paper also mentions it is hard to calculate free up space by migrating files to the cloud
-> needs an algorithm to estimate the amount of space unique to a set of files.
-
-2. Something related to security
-Data Domain appliance tends to be utilized by a single customer, who typically selects a single encryption key for all of the data
-> have not found customer demand for convergent encryption or stronger encryption requirements for cloud storage than on-premises storage.
-
-If multiple keys are selected, customers accept a potential loss in cross-dataset deduplication.
-
-3. How about disaster recovery?
-the main reason why the active tier and cloud tier have different deduplication domains. (each tier is a separate deduplication domain)
-> If an active tier is lost, the backup copies migrated to object storage can be recovered.
-
-4. The overhead of generating perfect hash
-In this paper, it mentions it is nearly 3 hours per 1PB of physical capacity.
+---
+typora-copy-images-to: ../paper_figure
+---
+Data Domain Cloud Tier: Backup here, backup there, deduplicated everywhere!
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| USENIX ATC'19 | Cloud Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Object storage in public and private clouds provides cost-effective, on-demand, always available storage.
+
+- Data Domain added a deduplicated **cloud tier** to its data protection appliances
+1. Its deduplication system consists of an **active tier**:
+> customers back up their primary data (typically retained for 30-90 days)
+
+2. It also contains a **cloud tier**:
+> selected backups are transitioned to cloud storage (retained long term 1-7 years)
+
+- Motivation
+there were many architectural changes necessary to support a cloud tier in a mature storage product.
+
+### Cloud Tier
+- Data domain cloud tier architecture
+
+
+
+- Active Tier Architecture
+
+> 1. A file is represented by a **Merkle tree** with user data as variable-sized chunks at the bottom level of the tree (referred to as *L0 chunks*).
+> 2. SHA-1 fingerprints of *L0 chunks* are grouped together at the next higher level of the tree to form chunks (*L1 chunks*).
+> 3. the top of the tree is the *L6 chunk*
+> 4. LP chunks: chunks above L0 are collectively referred to as LP chunks (a toy sketch of building this hierarchy follows this list)
+> 5. If two files are exactly the same, they would have the same L6 fingerprint. (if two files only partially overlap in content, then some branches of the tree will be identical.)
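+
+A rough Python sketch of the L0/LP hierarchy (my own illustration; the fanout value is an assumption, and real systems use content-defined grouping rather than fixed-size groups):
+
+```python
+import hashlib
+
+def build_lp_levels(chunks, fanout=512):
+    """Fingerprint data chunks (L0), then repeatedly hash groups of
+    fingerprints to form L1, L2, ... up to a single root fingerprint."""
+    level = [hashlib.sha1(c).digest() for c in chunks]        # L0 fingerprints
+    levels = [level]
+    while len(level) > 1:
+        groups = [b"".join(level[i:i + fanout]) for i in range(0, len(level), fanout)]
+        level = [hashlib.sha1(g).digest() for g in groups]    # LP fingerprints
+        levels.append(level)
+    return levels                                             # levels[-1][0]: root fingerprint
+
+levels = build_lp_levels([b"chunk-%d" % i for i in range(1000)], fanout=8)
+print([len(l) for l in levels])          # [1000, 125, 16, 2, 1]
+```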
+
+*Metadata and data separation*: L0-containers and LP-containers
+
+> the locality of L0 chunks is preserved which results in better read performance
+> Each container: data section (chunk), metadata section (fingerprints of chunks)
+
+- Cloud Tier Architecture
+ Goal of cloud tier:
+
+> 1. use it as extra capacity
+> 2. use it for long term archival of selected data
+
+**Metadata container**: this refers to a third type of container, which stores the metadata sections from multiple *L0- and LP-Containers*
+> the metadata sections of containers are read during deduplication and garbage collection and require quick access, so Metadata-Containers are stored on **local storage** as well as in **cloud storage**.
+> mirror the metadata
+
+**Main difference**: store critical cloud tier metadata on the local storage of the Data Domain system to improve performance and reduce cost.
+
+- Object Size (Container Size)
+it started with 64KB objects, but evolved to larger sizes in the range of 1-4MB for several reasons:
+> larger objects result in less metadata overhead.
+> also decrease transaction costs as cloud storage providers charge per-object transaction costs.
+> the paper uses the terms **objects** and **containers** interchangeably.
+
+- Perfect hashing and Physical scanning for the cloud tier
+1. Perfect hashing
+performs membership queries by representing **a fixed key set**.
+> a perfect hash function is a collision-free mapping which maps a key to a unique position in a bit vector. (1 : 1 mapping)
+> a perfect hash function + a bit vector
+
+2. Physical scanning
+discuss how to walk the LP chunks of all or most of the files in the system.
+> how to traverse the Merkle tree
+> enumeration is done in a breadth first manner
+
+
+- Estimate freeable space
+1. traverse the merkle tree of selected files, mark the chunks in an in-memory perfect hash vector.
+2. walk all the remaining files, unmark the chunks referenced by these remaining files.
+3. the chunks which are still marked in the perfect hash vector are the chunks which are uniquely referenced by files selected.
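+
+A toy version of these three steps (my own illustration; a plain Python set stands in for the paper's in-memory perfect hash vector):
+
+```python
+def estimate_freeable_space(selected, remaining, chunk_size):
+    """Chunks still marked after both passes are uniquely referenced by the
+    selected files; their total size is the space freed by migrating them."""
+    marked = set()
+    for fps in selected.values():          # pass 1: mark chunks of selected files
+        marked.update(fps)
+    for fps in remaining.values():         # pass 2: unmark chunks referenced elsewhere
+        marked.difference_update(fps)
+    return sum(chunk_size[fp] for fp in marked)
+
+# toy usage: files map to their sets of chunk fingerprints
+selected  = {"backup1": {"a", "b", "c"}}
+remaining = {"backup2": {"b", "d"}}
+sizes     = {"a": 8192, "b": 8192, "c": 4096, "d": 8192}
+print(estimate_freeable_space(selected, remaining, sizes))   # 8192 + 4096 = 12288
+```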
+
+- Seeding
+a one-time process to transfer a large amount of data from the active tier to a nearly empty cloud tier. (needs to generate a perfect hash vector)
+1. used for migration of large amount of data to the cloud.
+2. guarantees that all the L0 and LP chunks are transferred to the cloud before the file's location is changed in the namespace. (using perfect hash vector)
+
+- File Migration
+transfer a few files incrementally.
+> reduces the amount of data transferred to the cloud tier by performing a deduplication process relative to chunks already present in the cloud tier.
+> do not need to generate perfect hash vector, directly scan Merkle tree.
+
+- Garbage collection (GC)
+When customers expire backups, some chunks become unreferenced.
+> mark and sweep
+
+1. Active tier garbage collection
+after marking chunks live in the perfect hash vector, the sweep process walks the container set to copy live chunks from old containers into newer containers while deleting the old containers.
+
+2. Cloud tier garbage collection
+Since the L0 containers are not local and reading them from the cloud is expensive, it needs a way to do garbage collection without reading L0-containers from object storage and writing new L0-containers back to object storage.
+> need to implement new APIs in cloud providers.
+> delete a compression region in a cloud container when it is completely unreferenced, instead of deleting individual chunks.
+
+
+### Implementation and Evaluation
+- Evaluation
+1. Deployed system evaluation
+> GC analysis
+> Cleaning efficiency loss due to compression region cleaning
+
+2. Internal systems
+> Freeable space estimation
+> File migration and seeding performance
+> Garbage collection performance
+> File migration and restore from the cloud
+
+## 2. Strength (Contributions of the paper)
+1. propose a new algorithm to estimate the amount of space unique to a set of files.
+> builds upon a previous technique using **perfect hashes** and **sequential storage scans**.
+
+2. develop a bulk seeding algorithm that also uses perfect hashes to select the set of chunks to transfer.
+> transfer the unique content to the cloud tier to preserve the benefits of deduplication during transfer.
+
+
+3. design a garbage collection scheme for the cloud tier
+> handle the latency and financial cost of reading data from the cloud back to the on-premises appliance.
+
+
+## 3. Weakness (Limitations of the paper)
+1. For its cloud tier garbage collection, it needs to modify the internal APIs of the cloud providers.
+
+## 4. Some insights
+1. This paper also mentions that it is hard to calculate the space freed up by migrating files to the cloud
+> needs an algorithm to estimate the amount of space unique to a set of files.
+
+2. Something related to security
+A Data Domain appliance tends to be used by a single customer, who typically selects a single encryption key for all of the data
+> have not found customer demand for convergent encryption or stronger encryption requirements for cloud storage than on-premises storage.
+
+If multiple keys are selected, customers accept a potential loss in cross-dataset deduplication.
+
+3. How about disaster recovery?
+This is the main reason why the active tier and cloud tier have different deduplication domains. (each tier is a separate deduplication domain)
+> If an active tier is lost, the backup copies migrated to object storage can be recovered.
+
+4. The overhead of generating perfect hash
+The paper mentions that it takes nearly 3 hours per 1PB of physical capacity.
> presumably this assumes a static, fixed fingerprint set.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/Dedup-ATC'11.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/Dedup-ATC'11.md
old mode 100644
new mode 100755
index 072e839..43bc5e7
--- a/StoragePaperNote/Deduplication/Deduplication-System-Design/Dedup-ATC'11.md
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/Dedup-ATC'11.md
@@ -1,102 +1,102 @@
----
-typora-copy-images-to: ../paper_figure
----
-Building a High-performance Deduplication System
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ATC'11 | Deduplication System |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Effectiveness of a deduplication system
- - Deduplication efficiency
- how well the system can detect and share duplicate data units? (reduce the storage cost)
- - Scalability
- the ability to support large amounts of raw storage with consistent performance (reducing the total number of nodes)
- - Throughput
- the rate at which data can be transferred in and out the system (minimizing the length of a backup window)
-
-It is hard to achieve all those three goals.
-
-- The reference management problem
-the cost of reference management (upon addition and deletion of data) has become one of the biggest real-world bottlenecks.
-> take many hours per day.
-
-- The indexing problem
-Need to scale to high capacities, good indexing throughput, provide high duplicate detection rate.
-
-This paper presents a complete, **single-node** deduplication system that covers indexing, reference management, and end-to-end throughput optimization.
-
-
-### System Design
-- System architecture
-
-
-1. Container: raw data + a catalog which lists all FPs stored in the container
-2. Three-level hierarchy in File Manager (FM): files -> backup -> backup group
-> allowing the FM to perform coarse-granularity tracking of file/backup changes in the system.
-
-- Progressive Sampled Indexing
-Maintain a complete index is a difficult task since the index typically needs to be stored both in memory, for performance, and on disk, for durability.
-
-1. directly locatable objects:
-file chunk location information is stored with the file metadata, therefore removing the need to consult the index for the exact location of the file chunks.
-> file metadata size may increase
-
-2. sampled indexing
-It no longer need to maintain a full index. It can maintain a *sampled indexing*.
-> can determine the number of entries that need to be dropped.
-
-When a lookup operation hits on a sampled FP, it can locate the container it belongs to and pre-fetch all FPs from that container's metadata into a memory cache.
-
-3. progressive sampling
-using the progressive sampling based on the amount of storage used, as opposed to the maximum raw storage.
-
-- Grouped mark-and-sweep
-Original mark-and-sweep has poor scalability
-> because it needs to touch every file in the system.
-
-Key idea of grouped mark-and-sweep:
-avoid touching every file in the mark phase and every container in the sweep phase.
-> just detect the changed backup group instead of each file.
-> the file manager tracks the changes in each backup group
-> only containers used by groups that have deleted files need to be swept.
-
-- Client-server interaction
-Using full pipeline to achieve high throughput
-> can observe the length of message queue to adjust the number of worker threads.
-
-### Implementation and Evaluation
-- Evaluation
-1. dataset
-> synthetic data set: consists of multiple 3GB file, each with globally unique data chunks.
-> > stress the disk and the network systems as large amounts of data need to be transferred.
-> VM dataset
-
-2. throughput
-> backup throughput: index throughput, ssd indexing throughput, and end-to-end throughput, reference update throughput, restore
-
-3. deduplication efficiency
-
-
-## 2. Strength (Contributions of the paper)
-1. This paper shows the detail of implementing a index and cache index in the deduplication system.
-
-2. It also proposes progressive sampled indexing, grouped mark-and-sweep, and client-server interaction.
-
-3. this paper takes care of the issue that the how to improve the scalability issue in single-node deduplication system.
-
-4. Most other systems have provided good solutions for a subset of problems (three problems), usually excluding single-node scalability and reference management.
-
-## 3. Weakness (Limitations of the paper)
-1. In its grouped mark-and-sweep scheme, all the containers used by groups that have deleted files need to be swept, however some containers of them maybe unchanged.
-> incur the high overhead in the sweep phase.
-
-
-## 4. Some Insights (Future work)
-1. this paper mentions the power outages and data corruption are really not that rare in real deduplication system
-> need to consider the how to recover the deduplication metadata and data.
-
-2. In the restore operation, it mentions that using directly locatable objects allows it to perform restore without using the index, making the whole process very scalable.
+---
+typora-copy-images-to: ../paper_figure
+---
+Building a High-performance Deduplication System
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ATC'11 | Deduplication System |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Effectiveness of a deduplication system
+ - Deduplication efficiency
+ how well the system can detect and share duplicate data units? (reduce the storage cost)
+ - Scalability
+ the ability to support large amounts of raw storage with consistent performance (reducing the total number of nodes)
+ - Throughput
+ the rate at which data can be transferred in and out the system (minimizing the length of a backup window)
+
+It is hard to achieve all those three goals.
+
+- The reference management problem
+the cost of reference management (upon addition and deletion of data) has become one of the biggest real-world bottlenecks.
+> take many hours per day.
+
+- The indexing problem
+Needs to scale to high capacities, provide good indexing throughput, and achieve a high duplicate detection rate.
+
+This paper presents a complete, **single-node** deduplication system that covers indexing, reference management, and end-to-end throughput optimization.
+
+
+### System Design
+- System architecture
+
+
+1. Container: raw data + a catalog which lists all FPs stored in the container
+2. Three-level hierarchy in File Manager (FM): files -> backup -> backup group
+> allowing the FM to perform coarse-granularity tracking of file/backup changes in the system.
+
+- Progressive Sampled Indexing
+Maintaining a complete index is a difficult task since the index typically needs to be stored both in memory, for performance, and on disk, for durability.
+
+1. directly locatable objects:
+file chunk location information is stored with the file metadata, therefore removing the need to consult the index for the exact location of the file chunks.
+> file metadata size may increase
+
+2. sampled indexing
+It no longer needs to maintain a full index; it can maintain a *sampled index* instead.
+> can determine the number of entries that need to be dropped.
+
+When a lookup operation hits on a sampled FP, it can locate the container it belongs to and pre-fetch all FPs from that container's metadata into a memory cache.
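+
+A minimal sketch of sampled indexing with container prefetch (my own illustration, not the paper's code; sampling by low-order fingerprint bits and the data layout are assumptions):
+
+```python
+class SampledIndex:
+    def __init__(self, sample_bits, containers):
+        self.sample_bits = sample_bits   # keep only FPs whose low bits are all zero
+        self.containers = containers     # container_id -> list of FPs in its catalog
+        self.index = {}                  # sampled FP -> container_id
+        self.cache = {}                  # prefetched FP -> container_id
+
+    def _sampled(self, fp: int) -> bool:
+        return fp & ((1 << self.sample_bits) - 1) == 0
+
+    def insert(self, fp: int, container_id: int):
+        if self._sampled(fp):
+            self.index[fp] = container_id
+
+    def lookup(self, fp: int):
+        if fp in self.cache:
+            return self.cache[fp]
+        cid = self.index.get(fp)
+        if cid is not None:              # hit on a sampled FP: prefetch its container catalog
+            for other in self.containers[cid]:
+                self.cache[other] = cid
+        return cid
+
+containers = {7: [0x1000, 0x2a00, 0x3000]}     # container 7's catalog of FPs
+idx = SampledIndex(sample_bits=12, containers=containers)
+idx.insert(0x1000, 7)                          # 0x1000 has 12 low zero bits -> sampled
+print(idx.lookup(0x1000), idx.lookup(0x2a00))  # 7 7 (second hit served from the prefetched cache)
+```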
+
+3. progressive sampling
+using the progressive sampling based on the amount of storage used, as opposed to the maximum raw storage.
+
+- Grouped mark-and-sweep
+Original mark-and-sweep has poor scalability
+> because it needs to touch every file in the system.
+
+Key idea of grouped mark-and-sweep:
+avoid touching every file in the mark phase and every container in the sweep phase.
+> just detect the changed backup group instead of each file.
+> the file manager tracks the changes in each backup group
+> only containers used by groups that have deleted files need to be swept.
+
+- Client-server interaction
+Using a full pipeline to achieve high throughput
+> it can observe the length of the message queue to adjust the number of worker threads.
+
+### Implementation and Evaluation
+- Evaluation
+1. dataset
+> synthetic data set: consists of multiple 3GB files, each with globally unique data chunks.
+> > stress the disk and the network systems as large amounts of data need to be transferred.
+> VM dataset
+
+2. throughput
+> backup throughput: index throughput, ssd indexing throughput, and end-to-end throughput, reference update throughput, restore
+
+3. deduplication efficiency
+
+
+## 2. Strength (Contributions of the paper)
+1. This paper shows the details of implementing an index and index cache in a deduplication system.
+
+2. It also proposes progressive sampled indexing, grouped mark-and-sweep, and client-server interaction.
+
+3. This paper addresses how to improve scalability in a single-node deduplication system.
+
+4. Most other systems have provided good solutions for a subset of problems (three problems), usually excluding single-node scalability and reference management.
+
+## 3. Weakness (Limitations of the paper)
+1. In its grouped mark-and-sweep scheme, all the containers used by groups that have deleted files need to be swept; however, some of those containers may be unchanged.
+> this incurs high overhead in the sweep phase.
+
+
+## 4. Some Insights (Future work)
+1. This paper mentions that power outages and data corruption are really not that rare in real deduplication systems
+> need to consider how to recover the deduplication metadata and data.
+
+2. In the restore operation, it mentions that using directly locatable objects allows it to perform restore without using the index, making the whole process very scalable.
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/DedupDesignTradeoff-FAST'15.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/DedupDesignTradeoff-FAST'15.md
new file mode 100755
index 0000000..1eb1fe6
--- /dev/null
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/DedupDesignTradeoff-FAST'15.md
@@ -0,0 +1,108 @@
+---
+typora-copy-images-to: paper_figure
+---
+Design Tradeoffs for Data Deduplication Performance in Backup Workloads
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'15 | Data Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+ - In order to understand the fundamental tradeoffs in each of its design choices
+ - disassemble data deduplication into a large *N-dimensional parameter space*
+ - Parameter space (design parameter)
+ - backup and restore performance
+ - prefetching and caching
+ - rewriting
+ - memory footprint
+ - storage cost
+ - Goal
+    - make efficient design decisions according to the desired tradeoff.
+ - Present a general-purpose *Deduplication Framework*
+ - DeFrame: for comprehensive data deduplication evaluation
+
+### DeFrame
+- Inline data deduplication space
+ - the fingerprint index
+ - the recipe store
+ - the container store
+ - a fingerprint cache (hold popular fingerprints)
+
+- Fingerprint index
+ - a well-recognized performance bottleneck (a large-scale deduplication system)
+  - putting all fingerprints in DRAM is not cost-efficient
+ - Two submodules:
+ - a key-value store
+ - a fingerprint prefetching/caching module
+
+- Classification
+ - Exact deduplication / Near-exact deduplication
+ - Prefetching policy
+ - Logical locality (LL): for the *recipe*, point to segment
+ - Physical locality (PL): for the *container*, point to container ID
+ - 
+
+- Exact + Prefetching
+ - avoid a large fraction of lookup requests to the key-value store.
+ - the fragmentation problem would reduce the efficiency of the fingerprint prefetching and caching
+ - making the key-value store become **lookup-intensive over time**.
+
+- Near-exact + sample
+ - to downsize the key-value store
+ - important to maintain a high deduplication ratio
+ - near-exact deduplication generally indicates a **cost increase**.
+
+- rewriting
+ - improve the physical locality, the lookup overhead of EDPL no longer increases over time.
+
+
+- DeFrame Architecture
+ - Container store: metadata section + data section
+  - Recipe store: stores the associated container IDs so restores do not need to consult the fingerprint index, plus some indicators of segment boundaries.
+ - fingerprint index:
+ - in-DRAM hash table / a MySQL database paired with a Bloom filter
+ - Backup pipeline:
+ - Chunk, Hash, Dedup, Rewrite, Filter, Append
+ - Restore pipeline:
+ - Reading recipe, reading chunks, writing chunks
+
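+A toy rendition of the backup pipeline stages listed above (my own illustration; it uses fixed-size chunking and an in-memory dict index, and omits the Rewrite stage):
+
+```python
+import hashlib
+
+def backup_pipeline(stream, index, container_store, chunk_size=4096):
+    """Chunk -> Hash -> Dedup/Filter -> Append, returning the file recipe."""
+    recipe = []
+    for off in range(0, len(stream), chunk_size):
+        chunk = stream[off:off + chunk_size]          # Chunk
+        fp = hashlib.sha1(chunk).hexdigest()          # Hash
+        if fp not in index:                           # Dedup/Filter: unique chunk
+            index[fp] = len(container_store)
+            container_store.append(chunk)             # Append to the container store
+        recipe.append(fp)                             # recipe records every logical chunk
+    return recipe
+
+index, containers = {}, []
+recipe = backup_pipeline(b"A" * 16384 + b"B" * 4096, index, containers)
+print(len(recipe), len(containers))                   # 5 logical chunks, 2 unique chunks
+```
+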
+### Implementation and Evaluation
+- Implementation
+ - C++
+  - Dataset: Kernel, VMDK, RDB
+
+- Metrics
+ - Deduplication ratio
+ - memory footprint
+ - storage cost
+ - lookup requests per GB
+
+- Findings
+ - fragmentation results in an ever-increasing lookup overhead for EDPL
+ - EDLL achieves sustained performance
+ - Consider the self-reference
+
+## 2. Strength (Contributions of the paper)
+
+1. provide an overview of the deduplication (all design parameters)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. It mentions that the uniform sampling achieves a significantly higher deduplication ratio.
+
+2. When we use logical locality, it incurs extremely high update overhead
+> all fingerprints are updated with their new segment IDs in the key-value store.
+
+3. Although near-exact deduplication reduces the DRAM cost, it cannot reduce the total storage cost.
+
+4. Design decision
+> For lowest storage cost: EDLL is preferred (highest deduplication ratio, sustained high backup performance)
+> For low memory footprint: ND is preferred,
+> > NDPL: for its simpleness
+> > NDLL: better deduplication ratio
+> For a sustained high restore performance:
+> > EDPL + rewriting
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/DiskBottleneck-FAST'08.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/DiskBottleneck-FAST'08.md
old mode 100644
new mode 100755
index f564d35..97eaed6
--- a/StoragePaperNote/Deduplication/Deduplication-System-Design/DiskBottleneck-FAST'08.md
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/DiskBottleneck-FAST'08.md
@@ -1,101 +1,101 @@
----
-typora-copy-images-to: paper_figure
----
-Avoiding the Disk Bottleneck in the Data Domain Deduplication File System
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| FAST'08 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper wants to investigate how to do deduplication at **high speed** in order to meet the performance requirement for secondary storage used for data protection.
-> how to implement a high-throughput identical chunk deduplication storage system at low system cost
-> reduce disk I/Os in high-throughput deduplication storage system.
-
-Traditional method: maintain an on-disk index of segment (chunk) and use a cache to accelerate segment index access
-> the cache hit ratios is low: fingerprint values are random, no spatial locality, very little temporal locality.
-> cause the real performance of deduplication cannot compete with **high-end** tape libraries. (i.e., 100MB/sec)
-
-### Reduce the need for on-disk index lookups during the deduplicaiton process
-- Key idea:
-1. a Bloom filter, which it calls a Summary Vector, to test if a data segment is new to the system.
-> avoid wasted lookups for unexisted segments.
-
-2. Stream-Informed Segment Layout (SISL): store data segments and their fingerprints in the same order that they occur in a data file or stream
-> creates spatial locality
-
-3. Locality Preserved Caching: fetch and cache groups of segment fingerprints that are likely to be accessed together.
-
-- Performance-Capacity Balance
-Given a particular storage capacity, backup policy, and deduplication efficiency, it is possible to compute the throughput that the system must sustain to justify the capacity.
-
-- Storage System Architecture in Data Domain (DDFS)
-The stack architecture of Data Domain File System (DDFS):
-> 
-> **Content Store**: break a data stream into segments (chunking).
-> **Segment Store**: perform deduplication. (does the actual work of deduplication), also does the data compression to further compress the data size, and writes compressed results into containers supported by Container Manager.
-
-1. Content Store
-**Object**: an object is a linear sequence of client data bytes.
-> 1. **chunking**: byte range $\rightarrow$ variable-length segments
-> 2. **compute fingerprint**: SHA-1 hash
-> 3. **segment mapping**: builds the tree of segments that records the mapping between object byte ranges and segment descriptors.
-
-2. Segment Store
-> 1. **A database** of segments keyed by their segment descriptors.
-> 2. **Segment filtering**: key operation to deduplicated segments, may trigger disk I/Os.
-> 3. **Container packing**: adds segments to be stored to a container which is the unit of storage in the system. A container when fully packed, is appended to the Container Manager.
-> 4. **Segment indexing update**: segment index $\rightarrow$ the container holder.
-
-3. Container Manager
-> 1. allocating, deallocating, reading, writing and reliably storing containers.
-> 2. built on top of standard block storage
-
-- Acceleration Methods
-1. Summary vector
-it indicates whether the segment is in the index or not.
-> a bloom filter to summarize the existence information about $n$ fingerprints in the segment index.
-> choose the basic Bloom Filter for simplicity and efficient implementation.
-
-2. Stream informed segment layout (stream-aware)
-> enable locality preserved caching, create spatial locality.
-> **Segment duplicate locality**: when new data contains a duplicate segment $x$, there is high probability that other segments in its locale are duplicates of the neighbors of $x$.
-> stream abstraction: segregates the segments created for different objects, preserves the logical ordering of segments within the Content Store.
-> dedicated container holds segments for a single stream.
-
-**Benefits**:
-> 1. multiple segments of the same data stream are written to a container together, reduce disk I/Os, and achieve high read throughput.
-
-3. Locality preserved caching (LPC)
-maintain the segment cache by groups of fingerprints.
-> LPC will fetch the entire metadata section in a container, insert all fingerprints in the metadata section into the cache.
-> Intuition: base segments in this containers are likely to be checked against for future duplicate segments.
-> 
-
-- Whole workflow (write)
-for an incoming segment
-> 1. check to see if this segment is in the segment cache
-> 2. check the Summary Vector, and lookup the segment index for its containers ID, insert the metadata section of that container into the segment cache.
-
-### Implementation and Evaluation
-1. Trace Analysis: shows the deduplication storage system works well with the real world datasets.
-> cumulative total compression ratios increase as the system holds more backup data.
-
-2. I/O savings with summary vector and locality preserved caching
-> measure the number of disk reads for segment index lookups and locality prefetches.
-
-3. Thoughput (with synthetic datasets)
-> write throughput is more important than read throughput
-
-
-
-## 2. Strength (Contributions of the paper)
-1. The logic of this paper is very clear, it identifies the disk I/Os are the bottleneck of deduplication throughput. And, it proposes three methods to mitigate the effect of disk I/Os.
-2. The system described in this paper is very practical, and has been used at over 1,000 data centers.
-## 3. Weakness (Limitations of the paper)
-1. One limitation of this paper is it does not consider the case of restore operation in the long term retention.
-## 4. Future Works
-1. This work verifies that the system with content-based segmentation gets a lot of global compression.
+---
+typora-copy-images-to: paper_figure
+---
+Avoiding the Disk Bottleneck in the Data Domain Deduplication File System
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| FAST'08 | Deduplication System |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper wants to investigate how to do deduplication at **high speed** in order to meet the performance requirement for secondary storage used for data protection.
+> how to implement a high-throughput identical chunk deduplication storage system at low system cost
+> reduce disk I/Os in high-throughput deduplication storage system.
+
+Traditional method: maintain an on-disk index of segment (chunk) and use a cache to accelerate segment index access
+> the cache hit ratio is low: fingerprint values are random, with no spatial locality and very little temporal locality.
+> as a result, the real performance of deduplication cannot compete with **high-end** tape libraries. (i.e., 100MB/sec)
+
+### Reduce the need for on-disk index lookups during the deduplication process
+- Key idea:
+1. a Bloom filter, which it calls a Summary Vector, to test if a data segment is new to the system.
+> avoid wasted lookups for nonexistent segments.
+
+2. Stream-Informed Segment Layout (SISL): store data segments and their fingerprints in the same order that they occur in a data file or stream
+> creates spatial locality
+
+3. Locality Preserved Caching: fetch and cache groups of segment fingerprints that are likely to be accessed together.
+
+- Performance-Capacity Balance
+Given a particular storage capacity, backup policy, and deduplication efficiency, it is possible to compute the throughput that the system must sustain to justify the capacity.
+
+- Storage System Architecture in Data Domain (DDFS)
+The stack architecture of Data Domain File System (DDFS):
+> 
+> **Content Store**: break a data stream into segments (chunking).
+> **Segment Store**: perform deduplication. (does the actual work of deduplication), also does the data compression to further compress the data size, and writes compressed results into containers supported by Container Manager.
+
+1. Content Store
+**Object**: an object is a linear sequence of client data bytes.
+> 1. **chunking**: byte range $\rightarrow$ variable-length segments
+> 2. **compute fingerprint**: SHA-1 hash
+> 3. **segment mapping**: builds the tree of segments that records the mapping between object byte ranges and segment descriptors.
+
+2. Segment Store
+> 1. **A database** of segments keyed by their segment descriptors.
+> 2. **Segment filtering**: key operation to deduplicated segments, may trigger disk I/Os.
+> 3. **Container packing**: adds segments to be stored to a container which is the unit of storage in the system. A container when fully packed, is appended to the Container Manager.
+> 4. **Segment indexing update**: segment index $\rightarrow$ the container holder.
+
+3. Container Manager
+> 1. allocating, deallocating, reading, writing and reliably storing containers.
+> 2. built on top of standard block storage
+
+- Acceleration Methods
+1. Summary vector
+it indicates whether the segment is in the index or not.
+> a bloom filter to summarize the existence information about $n$ fingerprints in the segment index.
+> choose the basic Bloom Filter for simplicity and efficient implementation.
+
+2. Stream informed segment layout (stream-aware)
+> enable locality preserved caching, create spatial locality.
+> **Segment duplicate locality**: when new data contains a duplicate segment $x$, there is high probability that other segments in its locale are duplicates of the neighbors of $x$.
+> stream abstraction: segregates the segments created for different objects, preserves the logical ordering of segments within the Content Store.
+> dedicated container holds segments for a single stream.
+
+**Benefits**:
+> 1. multiple segments of the same data stream are written to a container together, reduce disk I/Os, and achieve high read throughput.
+
+3. Locality preserved caching (LPC)
+maintain the segment cache by groups of fingerprints.
+> LPC will fetch the entire metadata section in a container, insert all fingerprints in the metadata section into the cache.
+> Intuition: base segments in these containers are likely to be checked against for future duplicate segments.
+> 
+
+- Whole workflow (write)
+for an incoming segment
+> 1. check to see if this segment is in the segment cache
+> 2. check the Summary Vector, and look up the segment index for its container ID; insert the metadata section of that container into the segment cache.
+
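+A minimal end-to-end sketch of this write path (my own illustration, not DDFS code): a small Bloom filter stands in for the Summary Vector, and a set of fingerprints per container stands in for the container metadata section used by locality preserved caching. Sizes and hash counts are illustrative assumptions.
+
+```python
+import hashlib
+
+class SummaryVector:
+    """Tiny Bloom filter: answers "possibly present" / "definitely new" for fingerprints."""
+    def __init__(self, num_bits=1 << 20, num_hashes=4):
+        self.num_bits, self.num_hashes = num_bits, num_hashes
+        self.bits = bytearray(num_bits // 8)
+
+    def _positions(self, fp: bytes):
+        for i in range(self.num_hashes):
+            h = hashlib.sha1(bytes([i]) + fp).digest()
+            yield int.from_bytes(h[:8], "big") % self.num_bits
+
+    def add(self, fp: bytes):
+        for p in self._positions(fp):
+            self.bits[p // 8] |= 1 << (p % 8)
+
+    def might_contain(self, fp: bytes) -> bool:
+        return all(self.bits[p // 8] & (1 << (p % 8)) for p in self._positions(fp))
+
+def is_duplicate(fp, segment_cache, summary_vector, on_disk_index, container_metadata):
+    """Write-path filtering: cache first, then Summary Vector, then the on-disk
+    index; an index hit prefetches the whole container's fingerprints (LPC)."""
+    if fp in segment_cache:                        # 1. locality preserved cache hit
+        return True
+    if not summary_vector.might_contain(fp):       # 2. definitely new: skip the index lookup
+        return False
+    cid = on_disk_index.get(fp)                    # 3. possible duplicate: consult the index
+    if cid is None:                                #    (Bloom filter false positive)
+        return False
+    segment_cache.update(container_metadata[cid])  # 4. prefetch that container's metadata section
+    return True
+
+sv, cache, index = SummaryVector(), set(), {}
+fp = hashlib.sha1(b"segment-0").digest()
+print(is_duplicate(fp, cache, sv, index, {}))      # False: brand new segment
+```
+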
+### Implementation and Evaluation
+1. Trace Analysis: shows the deduplication storage system works well with the real world datasets.
+> cumulative total compression ratios increase as the system holds more backup data.
+
+2. I/O savings with summary vector and locality preserved caching
+> measure the number of disk reads for segment index lookups and locality prefetches.
+
+3. Throughput (with synthetic datasets)
+> write throughput is more important than read throughput
+
+
+
+## 2. Strength (Contributions of the paper)
+1. The logic of this paper is very clear: it identifies that disk I/Os are the bottleneck of deduplication throughput, and it proposes three methods to mitigate the effect of disk I/Os.
+2. The system described in this paper is very practical, and has been used at over 1,000 data centers.
+## 3. Weakness (Limitations of the paper)
+1. One limitation of this paper is that it does not consider the restore operation under long-term retention.
+## 4. Future Works
+1. This work verifies that the system with content-based segmentation gets a lot of global compression.
2. This work also mentions that fragmentation will become more severe for **long-term retention**, and can reduce the effectiveness of caching.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/ExtremeBining-MASCOTS'09.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/ExtremeBining-MASCOTS'09.md
old mode 100644
new mode 100755
index df0ce47..bcee566
--- a/StoragePaperNote/Deduplication/Deduplication-System-Design/ExtremeBining-MASCOTS'09.md
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/ExtremeBining-MASCOTS'09.md
@@ -1,125 +1,125 @@
----
-typora-copy-images-to: ../paper_figure
----
-Extreme Binning: Scalable, Parallel Deduplication for Chunk-based File Backup
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| MASCOTS'09 | Back up deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The performance of the entire backup operation depends on its backup throughput.
-> 1. Traditional backup workloads: consist of large data streams with high **locality**. (to provide reasonable throughput)
-> 2. Non-traditional backup workloads: made up of individual files with **no locality** among consecutive files in a given window of time.
-
-- Motivation
-a large-scale inline deduplication system faces the **disk bottleneck** problem.
-> to facilitate fast chunk ID lookup, a single index containing the chunk IDs of all the backed-up chunks must be maintained.
-
-This paper wants to solve this issue to service fine-grained low-locality backup workloads.
-> workload consists of files, instead of large data streams, that arrive in random order from disparate sources.
-> no locality between files that arrive in a given window of time.
-
-
-### Extreme Binning
-- Two tier index design
-1. splits up the chunk index into two tiers:
-> primary index: one representative chunk ID entry per file (reside in RAM)
-> second index: the rest of file's chunk IDs (in disk)
-
-
-
-2. the choice of the representative chunk ID
-Based on Broder's theorem
-> the probability that the two sets have the same minimum hash element is the same as their Jaccard similarity coefficient
-> If two files are highly similar they share many chunks and hence their minimum chunk ID is the same with high probability
-
-Extreme binning chooses the minimum chunk ID of a file to be its representative chunk ID.
-
-
-
-It also records the hash of file for comparison.
-> by keeping the whole file hash in the primary index, avoid making a disk access for chunk lookup for most duplicate files.
-
-**The Rationale**:
-Extreme Binning groups together files that are highly similar to each other.
-> duplicate chunks are identified with high accuracy.
-> Only one bin is selected per file
-
-- Distributed scenario with multiple backup nodes
-1. In multiple backup nodes, the two-tier chunk index must first be partitioned and each partition allocated to a backup node.
-> distribute objects to maximize scalability and reliability.
-> A file can be chunked by one backup node and deduplicated by another node.
-
-2. Every entry in the primary index is examined to determine to which backup node
-> For example, $K$ backup nodes, chunk ID is $c_i$, $c_i \mod K$
-> When a primary index entry moves, the bin attached to it also moves to the same backup node, all the data chunks attached to the bin also move to the same backup node.
-
-This solution can make each bin independent, but if a chunk ID appears in two bins, there will be two copies of it corresponding data chunk.
-> make scale out operations clean and simple.
-
-3. It mentions that a set of master nodes can be installed to do the chunking
-> allow for maximum parallelization
-
-**Why do this?**
-System scale out does not affect deduplication. (stateless)
-
-### Implementation and Evaluation
-- Evaluation
-1. Dataset
-Two datasets:
-> HDup: a high number of duplicates on account of all the full backups.
-> LDup: incremental backup dataset, contains few duplicates.
-> Linux distributions
-
-Chunking method and hash function
-> TTTD and SHA-1
-
-2. Deduplication Efficiency
-shows that Extreme Binning yields excellent deduplication and that the overhead of extra storage space is small.
-
-3. Load distribution
-show Extreme Binning ensuring smooth scale out and preventing any node from becoming a bottleneck to the overall system performance.
-
-## 2. Strength (Contributions of the paper)
-1. Exploit file similarity instead of chunk locality, split the chunk into two tiers
-> 1. one tier is small enough to reside in RAM
-> 2. the second tier is kept on disk
-
-2. In a distributed setting (with multiple backup nodes)
-> 1. using a stateless routing algorithm to for every incoming files to allocate a single backup node.
-> 2. Each node manages its own index and data without sharing or knowing the contents of other backup nodes
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. Inline deduplication vs. Post-process deduplication
-data is deduplicated before it is written to disk
-> extra disk space is not required to hold and protect data yet to be backed up. (Data Domain, Hewlett Packard)
-> data is first written to a temporary staging area
-
-
-2. In this paper, this method also allows some duplicate chunks. This is uncontrollable.
-> it argues that this loss of deduplication is minimal for representative workloads (in practice)
-
-3. Extreme Binning represents a trade off between deduplication throughput and deduplication efficiency.
-
-4. Using distributed hash tables
-This paper mentions that: a flat chunk index could be partitioned like a DHT
-> by using a consistent hashing scheme to map every chunk ID to a partition. (Every partition can then by hosted by a dedicated compute node)
-> Autonomy of backup nodes is not possible in such a design
-
-5. The relationship between Sparse indexing and DDFS
-- Sparse indexing:
-design for data streams, chunks the stream into multiple megabyte segments (slightly sampled)
-> crucially dependent on chunk locality, poor chunk locality may produce unacceptably poor levels of deduplication for them.
-
-- DDFS
-relies heavily on inherent data locality for its cache to be effective to improve throughput
-> in-memory Bloom filter and caches index fragments
-> the lack of chunk locality renders the caching ineffectual
-
-- Extreme Binning
+---
+typora-copy-images-to: ../paper_figure
+---
+Extreme Binning: Scalable, Parallel Deduplication for Chunk-based File Backup
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| MASCOTS'09 | Backup deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The performance of the entire backup operation depends on its backup throughput.
+> 1. Traditional backup workloads: consist of large data streams with high **locality**. (to provide reasonable throughput)
+> 2. Non-traditional backup workloads: made up of individual files with **no locality** among consecutive files in a given window of time.
+
+- Motivation
+A large-scale inline deduplication system faces the **disk bottleneck problem**.
+> To facilitate fast chunk ID lookups, a single index containing the chunk IDs of all backed-up chunks must be maintained.
+
+This paper aims to solve this issue in order to serve fine-grained, low-locality backup workloads.
+> The workload consists of files, rather than large data streams, that arrive in random order from disparate sources.
+> There is no locality between files that arrive in a given window of time.
+
+
+### Extreme Binning
+- Two-tier index design
+1. splits the chunk index into two tiers:
+> primary index: one representative chunk ID entry per file (resides in RAM)
+> secondary index: the rest of the file's chunk IDs (on disk)
+
+
+
+2. the choice of the representative chunk ID
+Based on Broder's theorem
+> the probability that the two sets have the same minimum hash element is the same as their Jaccard similarity coefficient
+> If two files are highly similar, they share many chunks, and hence their minimum chunk IDs are the same with high probability
+
+Extreme Binning chooses the minimum chunk ID of a file to be its representative chunk ID.
+
+
+
+It also records the whole-file hash for comparison.
+> by keeping the whole-file hash in the primary index, it avoids a disk access for chunk lookups for most duplicate files.
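+
+A minimal sketch of how a primary-index entry could be built, assuming SHA-1 chunk fingerprints (as used in the paper's evaluation); the helper names are illustrative, not the paper's implementation.
+
+```python
+import hashlib
+
+def chunk_id(chunk: bytes) -> bytes:
+    return hashlib.sha1(chunk).digest()          # 20-byte chunk fingerprint
+
+def representative_chunk_id(chunks) -> bytes:
+    # Broder's theorem: two similar files share their minimum chunk ID with
+    # probability equal to their Jaccard similarity, so the minimum chunk ID
+    # acts as a similarity signature for the whole file.
+    return min(chunk_id(c) for c in chunks)
+
+def whole_file_hash(chunks) -> bytes:
+    h = hashlib.sha1()
+    for c in chunks:
+        h.update(c)
+    return h.digest()
+
+# primary index (in RAM): representative chunk ID -> (whole-file hash, bin id on disk)
+primary_index = {}
+```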
+
+**The Rationale**:
+Extreme Binning groups together files that are highly similar to each other.
+> duplicate chunks are identified with high accuracy.
+> Only one bin is selected per file
+
+- Distributed scenario with multiple backup nodes
+1. With multiple backup nodes, the two-tier chunk index must first be partitioned and each partition allocated to a backup node.
+> distribute objects to maximize scalability and reliability.
+> A file can be chunked by one backup node and deduplicated by another node.
+
+2. Every entry in the primary index is examined to determine which backup node it belongs to (a small routing sketch follows this subsection).
+> For example, with $K$ backup nodes and representative chunk ID $c_i$, the entry is routed to node $c_i \bmod K$.
+> When a primary index entry moves, the bin attached to it moves to the same backup node, and all the data chunks attached to the bin move along with it.
+
+This solution keeps each bin independent, but if a chunk ID appears in two bins, there will be two copies of its corresponding data chunk.
+> makes scale-out operations clean and simple.
+
+3. It mentions that a set of master nodes can be installed to do the chunking
+> allow for maximum parallelization
+
+**Why do this?**
+System scale out does not affect deduplication. (stateless)
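+
+As referenced above, the stateless routing rule can be sketched as follows; `K` and the byte-to-integer conversion are illustrative assumptions.
+
+```python
+def route_to_backup_node(representative_id: bytes, K: int) -> int:
+    # stateless: the destination depends only on the file's representative
+    # chunk ID, so any node can compute it without shared routing state
+    return int.from_bytes(representative_id, "big") % K
+
+# With K backup nodes, a file's primary-index entry, its bin, and all data
+# chunks attached to that bin end up on the same node.
+```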
+
+### Implementation and Evaluation
+- Evaluation
+1. Dataset
+Two datasets:
+> HDup: a high number of duplicates on account of all the full backups.
+> LDup: incremental backup dataset, contains few duplicates.
+> Linux distributions
+
+Chunking method and hash function
+> TTTD and SHA-1
+
+2. Deduplication Efficiency
+shows that Extreme Binning yields excellent deduplication and that the overhead of extra storage space is small.
+
+3. Load distribution
+shows that Extreme Binning ensures smooth scale-out and prevents any node from becoming a bottleneck to overall system performance.
+
+## 2. Strength (Contributions of the paper)
+1. Exploit file similarity instead of chunk locality, splitting the chunk index into two tiers
+> 1. one tier is small enough to reside in RAM
+> 2. the second tier is kept on disk
+
+2. In a distributed setting (with multiple backup nodes)
+> 1. using a stateless routing algorithm to allocate every incoming file to a single backup node.
+> 2. Each node manages its own index and data without sharing or knowing the contents of other backup nodes
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. Inline deduplication vs. post-process deduplication
+In inline deduplication, data is deduplicated before it is written to disk
+> extra disk space is not required to hold and protect data yet to be backed up. (Data Domain, Hewlett Packard)
+> in post-process deduplication, data is first written to a temporary staging area
+
+
+2. The method in this paper also allows some duplicate chunks to be stored, and this is uncontrollable.
+> it argues that this loss of deduplication is minimal for representative workloads (in practice)
+
+3. Extreme Binning represents a trade-off between deduplication throughput and deduplication efficiency.
+
+4. Using distributed hash tables
+This paper mentions that a flat chunk index could be partitioned like a DHT
+> by using a consistent hashing scheme to map every chunk ID to a partition. (Every partition can then be hosted by a dedicated compute node)
+> Autonomy of backup nodes is not possible in such a design
+
+5. The relationship between Sparse indexing and DDFS
+- Sparse indexing:
+designed for data streams; chunks the stream into multi-megabyte segments (sparsely sampled)
+> crucially dependent on chunk locality; poor chunk locality may produce unacceptably poor levels of deduplication.
+
+- DDFS
+relies heavily on inherent data locality for its cache to be effective in improving throughput
+> in-memory Bloom filter and caches index fragments
+> the lack of chunk locality renders the caching ineffectual
+
+- Extreme Binning
From my perspective, it mainly targets scalable and parallel deduplication for workloads with low locality.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/Redesigning-ATC'18.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/Redesigning-ATC'18.md
old mode 100644
new mode 100755
index 69dd1a9..fbeff29
--- a/StoragePaperNote/Deduplication/Deduplication-System-Design/Redesigning-ATC'18.md
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/Redesigning-ATC'18.md
@@ -1,139 +1,139 @@
----
-typora-copy-images-to: ../paper_figure
----
-Can't We All Get Along? Redesigning Protection Storage for Modern Workloads
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| USENIX ATC'18 | Redesign Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Considering the impact of evolving workloads on system performance on Data Domain File System (DDFS) as a whole.
-> 1. the impact of increasing numbers of small files
-> 2. higher deduplication ratios
-
-- Garbage collection (GC) was slowed by these changing workloads
-> need new algorithms
-
-A new requirement is particularly demanding
-> involves large amounts of **nonsequential I/O (NSIO)**
-
-- Therefore, this paper argues that they are at an inflection point.
-> need to rethink and redesign backup systems to enable **optimizaed performance for non-traditional data protection workloads with nonsequential accesses**.
-
-### New DDFS
-- Traditional workloads and Non-traditional workloads
-1. Traditional workloads: large sequential writes and reads with low metadata operations
-2. Nontraditional workloads: have many small files, more metadata operations, and frequent non-sequential accesses.
-
-
-- Deduplicating protection storage
-Each file is represented by a Merkle tree.
-
-- Key Improvement motivation
-Moving the index to SSDs was a necessary step to improve performance.
-
-- Changing environments
-1. Data format: traditionally, treat the backup as tar format (cannot restore an individual files inside of it)
-> a shift towards backing up individual files in their "native" format
-> lead to millions or billions of individual fils (metadata overhead)
-
-2. Unit of deduplication: need to consider the application knowledge
-> variable-size chunk or fixed-size chunk?
-> need to align the unit of deduplication appropriately.
-
-3. Devices: SSD caching
-> how to retrofit this to an existing disk-based data protection system.
-
-4. Mixed workloads: DDFS was originally designed to support **large sequential access** patterns
-> currently, need to support both sequential accesses and random accesses in the workload.
-
-- SSD caches
-1. Fingerprint Index Cache (FPI)
-fingerprint to container index (location in storage)
-> to support NSIO, the system keeps the **entire FPI** in SSD.
-> Since space is limited, it just store a **short hash** (4 bytes) rather than **long hash** (20 bytes)
-
-
-Short fingerprints would introduce the false positive (match is incorrect) (**can be detected**)
-> if the needed chunk is not found in the container referenced by the short fingerprint, then the full on-disk index is consulted. (latnecy is high, while this case is infrequency)
-
-Can receive more improvement **when the locality is bad**.
-
-2. File Metadata Cache (FMD)
-
-
-3. Chunk Data Cache
-The main difference:
-> 1. the locality of access within a contianer may be highly variable, and the reuse of specific data may by more commonplace.
-> 2. a data chunk might be written and then read, with a gap between the accesses that would be too large for the data to reside in a client or server DRAM cache.
-
-4. Directories Cache
-mapping file path to Merkle tree
-> 1. For large files such VM image, the lookup overhead is insignificant.
-> 2. But for modern workload where it includes many small files, the directory lookup becomes significant performance penalty.
-
-A full copy of directory manager is now cached in SSD for performance.
-> straightforward, DM is allocated 5% of the SSD cache.
-
-- File system modifications to support nonsequential workloads
-**Goal**: software optimizations remove unnecessary I/Os to disk
-1. Detecting workload type
-To decide whetehr NSIO processing is needed.
-> partition large files into regions and keeps a history of recent I/Os per region
-> if a new I/O is not within a threshold distance of one of the previous 16 I/Os, it is considerd nonsequential.
-
-
-2. QoS and throttling
-> split the backup and restore workload shares into sequential and nonsequential shares
-> when meet RPC timeouts, I/O throttling per workload (implement an edge throttling mechanism)
-
-3. selective fingerprint index queries
-it chooses to skip redundancy checks to improve performance
-> nonsequential writes tend to consist of unique content
-> disable query the fingerprint index for small nonsequential writes (<128KB)
-> Any duplicate chunks will be removed during periodic garbage collection
-
-4. Delay metadata update
-switch to fixed-sized chunks (for efficient updates of file recipes during nonsequential writes)
-> need to update the file recipe to reference new chunks (delay until sufficient updates have been accumulated)
-
-5. Direct read of compression regions
-change the mapping of fingerprints to container
-> to fingerprint to container and compression region index (reduce the index lookup I/O)
-> Reason: for NSIO, feature accesses are unlikely to remain within the same container.
-
-6. Adjust the chunk size to improve nonsequential writes
-Although variable-sized chunking has better deduplication, the performance gains achieve with fixed-size chunks outweighs the deduplication loss.
-> new use cases that have block-aligned writes
-
-### Implementation and Evaluation
-
-- Evaluation
-Focus on average IOPS and latency
-> 1. without SSD cache + without software optimization
-> 2. SSD cache metadata + without software optimization
-> 3. SSD cache data and metadata + without software optimization
-> 3. SSD cache data and metadata + software optimization
-
-## 2. Strength (Contributions of the paper)
-1. This paper extends support to modern backup applications with NSIO access patterns.
-> benefit both traditional and non-traditional backup and restore tasks
-
-2. propose some optimized technologies to better utilize the benefits of flash
-> selectively storing metadata on SSDs.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. This paper mentions the change of workload in deduplication system
-> NSIO and SIO, using SSDs for caching metadata
-> combining SSD caching with software optimizations throughout its system
-
-2. Main contribution: add SSD cache + software optimization
-
-3. DDFS has had to evolve to support not only traditional workloads (sequential backup worload)
-> support also newer non-sequential workloads. (direct access for reads and writes in place)
+---
+typora-copy-images-to: ../paper_figure
+---
+Can't We All Get Along? Redesigning Protection Storage for Modern Workloads
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| USENIX ATC'18 | Redesign Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Considers the impact of evolving workloads on the performance of the Data Domain File System (DDFS) as a whole.
+> 1. the impact of increasing numbers of small files
+> 2. higher deduplication ratios
+
+- Garbage collection (GC) was slowed by these changing workloads
+> need new algorithms
+
+A new requirement is particularly demanding
+> involves large amounts of **nonsequential I/O (NSIO)**
+
+- Therefore, this paper argues that they are at an inflection point.
+> need to rethink and redesign backup systems to enable **optimized performance for non-traditional data protection workloads with nonsequential accesses**.
+
+### New DDFS
+- Traditional workloads and Non-traditional workloads
+1. Traditional workloads: large sequential writes and reads with low metadata operations
+2. Nontraditional workloads: have many small files, more metadata operations, and frequent non-sequential accesses.
+
+
+- Deduplicating protection storage
+Each file is represented by a Merkle tree.
+
+- Key Improvement motivation
+Moving the index to SSDs was a necessary step to improve performance.
+
+- Changing environments
+1. Data format: traditionally, a backup is treated as a tar-format aggregate (an individual file inside it cannot be restored)
+> a shift towards backing up individual files in their "native" format
+> leads to millions or billions of individual files (metadata overhead)
+
+2. Unit of deduplication: need to consider the application knowledge
+> variable-size chunk or fixed-size chunk?
+> need to align the unit of deduplication appropriately.
+
+3. Devices: SSD caching
+> how to retrofit this to an existing disk-based data protection system.
+
+4. Mixed workloads: DDFS was originally designed to support **large sequential access** patterns
+> currently, need to support both sequential accesses and random accesses in the workload.
+
+- SSD caches
+1. Fingerprint Index Cache (FPI)
+fingerprint to container index (location in storage)
+> to support NSIO, the system keeps the **entire FPI** in SSD.
+> Since space is limited, it stores only a **short hash** (4 bytes) rather than the **long hash** (20 bytes)
+
+
+Short fingerprints can introduce false positives (the match is incorrect), but these **can be detected** (see the lookup sketch after this list of caches).
+> if the needed chunk is not found in the container referenced by the short fingerprint, the full on-disk index is consulted. (latency is high, but this case is infrequent)
+
+The improvement is larger **when locality is bad**.
+
+2. File Metadata Cache (FMD)
+
+
+3. Chunk Data Cache
+The main difference:
+> 1. the locality of access within a container may be highly variable, and the reuse of specific data may be more commonplace.
+> 2. a data chunk might be written and then read, with a gap between the accesses that would be too large for the data to reside in a client or server DRAM cache.
+
+4. Directories Cache
+mapping file path to Merkle tree
+> 1. For large files such as VM images, the lookup overhead is insignificant.
+> 2. But for modern workloads that include many small files, directory lookups become a significant performance penalty.
+
+A full copy of the directory manager (DM) is now cached in SSD for performance.
+> straightforward: the DM is allocated 5% of the SSD cache.
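+
+As noted for the fingerprint index cache above, the 4-byte short hash can produce false positives that are detected by checking the referenced container. The following is a hedged sketch of that lookup; `fpi_cache`, `container_has_chunk`, and `full_index_lookup` are hypothetical names, not DDFS internals.
+
+```python
+def locate_chunk(full_fp: bytes, fpi_cache, container_has_chunk, full_index_lookup):
+    short_fp = full_fp[:4]                        # 4-byte short hash kept on SSD
+    container_id = fpi_cache.get(short_fp)
+    if container_id is not None and container_has_chunk(container_id, full_fp):
+        return container_id                       # common case: served from SSD
+    # short-hash false positive or miss: fall back to the full on-disk index
+    # (high latency, but infrequent)
+    return full_index_lookup(full_fp)
+```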
+
+- File system modifications to support nonsequential workloads
+**Goal**: software optimizations remove unnecessary I/Os to disk
+1. Detecting workload type
+To decide whether NSIO processing is needed (a small detector sketch appears after this list).
+> partition large files into regions and keep a history of recent I/Os per region
+> if a new I/O is not within a threshold distance of one of the previous 16 I/Os, it is considered nonsequential.
+
+
+2. QoS and throttling
+> split the backup and restore workload shares into sequential and nonsequential shares
+> when RPC timeouts occur, throttle I/O per workload (an edge-throttling mechanism is implemented)
+
+3. selective fingerprint index queries
+it chooses to skip redundancy checks to improve performance
+> nonsequential writes tend to consist of unique content
+> disable fingerprint index queries for small nonsequential writes (<128KB)
+> Any duplicate chunks will be removed during periodic garbage collection
+
+4. Delay metadata update
+switch to fixed-sized chunks (for efficient updates of file recipes during nonsequential writes)
+> need to update the file recipe to reference new chunks (delay until sufficient updates have been accumulated)
+
+5. Direct read of compression regions
+change the mapping from fingerprint to container
+> to fingerprint to (container, compression region) index (reduces the index lookup I/O)
+> Reason: for NSIO, future accesses are unlikely to remain within the same container.
+
+6. Adjust the chunk size to improve nonsequential writes
+Although variable-sized chunking has better deduplication, the performance gains achieved with fixed-size chunks outweigh the deduplication loss.
+> new use cases that have block-aligned writes
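+
+The workload-type detector from item 1 above can be sketched as follows, assuming a per-region ring buffer of the 16 most recent I/O offsets; the region size and distance threshold are illustrative assumptions, not the paper's parameters.
+
+```python
+from collections import defaultdict, deque
+
+REGION_SIZE = 1 << 30      # assumed region granularity (1 GiB), illustrative
+THRESHOLD = 1 << 20        # assumed "near" distance (1 MiB), illustrative
+HISTORY = 16               # compare against the previous 16 I/Os, as in the paper
+
+history = defaultdict(lambda: deque(maxlen=HISTORY))   # region -> recent offsets
+
+def classify_io(file_id, offset):
+    region = (file_id, offset // REGION_SIZE)
+    recent = history[region]
+    sequential = any(abs(offset - prev) <= THRESHOLD for prev in recent)
+    recent.append(offset)
+    return "sequential" if sequential else "nonsequential"
+```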
+
+### Implementation and Evaluation
+
+- Evaluation
+Focus on average IOPS and latency
+> 1. without SSD cache + without software optimization
+> 2. SSD cache metadata + without software optimization
+> 3. SSD cache data and metadata + without software optimization
+> 4. SSD cache data and metadata + software optimization
+
+## 2. Strength (Contributions of the paper)
+1. This paper extends support to modern backup applications with NSIO access patterns.
+> benefit both traditional and non-traditional backup and restore tasks
+
+2. proposes several optimizations to better utilize the benefits of flash
+> selectively storing metadata on SSDs.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. This paper discusses the change of workloads in deduplication systems
+> NSIO and SIO, using SSDs for caching metadata
+> combining SSD caching with software optimizations throughout its system
+
+2. Main contribution: add SSD cache + software optimization
+
+3. DDFS has had to evolve to support not only traditional workloads (sequential backup workloads)
+> but also newer non-sequential workloads. (direct access for reads and writes in place)
> expect NSIO workloads to become more common as customers increase the frequency of backups.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/SparseIndex-FAST'09.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/SparseIndex-FAST'09.md
old mode 100644
new mode 100755
index 70c626f..31136bc
--- a/StoragePaperNote/Deduplication/Deduplication-System-Design/SparseIndex-FAST'09.md
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/SparseIndex-FAST'09.md
@@ -1,108 +1,108 @@
----
-typora-copy-images-to: ../paper_figure
----
-Sparse Indexing: Large Scale, Inline Deduplication Using Sampling and Locality
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'09 | Deduplication Index |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-1. This paper wants to solve the chunk-lookup disk bottleneck problem of large-scale backup that inline, chunk-based deduplication schemes face.
-> originally, it needs to index every chunk.
-> **chunk-lookup disk bottleneck problem**
-
-2. It concerns the scenario of inline-deduplication
-> data is deduplicated as it arrives rather than later in batch mode.
-> the backup data is a form of data stream (e.g., 400GB image)
-
-3. This paper solves this problem via **chunk locality**.
-> 1. the tendency for chunks in backup data streams to reoccur together
-> 2. For example: if the last time it encountered chunk $A$, it was surrounded by chunks $B$, $C$, and $D$, then next time it encounters $A$ (even for different backup), it is likely that it will also encounter $B$, $C$ or $D$ nearby.
-
-### Spare Index
-- Key observation
-If two pieces of backup streams share any chunks, they are likely to share many chunks.
-
-
-- Inline deduplication vs. out-of-line deuplication
-1. Inline deduplication: the data is deuplicated as it arrives and before it hits disk.
-2. Out-of-line deduplication: the data is first accumulated in an on-disk holding area and then deduplicated later in batch mode.
-
-The disadvantages of out-of-line deduplication:
->1. the need for an on-disk holding area large enough to hold an entire backup window's worth of raw data.
->2. need to implement a separate holding area.
->3. it is not possible to conserve network or disk bandwidth because every chunk must be written to the hold
-
-- Segmentation
-In its approach, a segment is the unit of storage and retrieval.
-> segment: a sequence of chunks, a few megabytes
-> two segments are similar: if they share a number of chunks.
-
-**Segment manifests**: segment recipe
-> allows reconstructing a segment from its chunks.
-> every stored segment has a manifest that is stored on disk.
-
-Segmentation method:
-> 1. Fixed-sized segments
-> 2. Variable-size segmentation: use the same trick at the chunking level to avoid the boundary-shifting problem, base the landmarks in the content.
-
-- Deduplication step
-Step 1: identify among all the segments in the store some that are most similar to the incoming segment
-> this paper calls them "champions"
-
-Step 2: deduplicate against those "champions" segments by finding the chunks they share with the incoming segment, which do not need to be stored again.
-
-
-
-
-- How to identify the similar segments
-sample the chunk hashes of the incoming segment, and use an in-RAM index to determine which already-stored segments contain how many of those hashes.
-> **sparse index**: maps the samples to the manifests in which they occur. (Need to set the limitation for the number of manifests that can be pointed by any one hook)
-> once it has chosen champions, it can load their manifests into RAM and use them to deduplicate the incoming segment.
-> Assuptions: chunk locality they are likely to share many other chunks with the incoming segment as well.
-> a **score scheme** to determine the champions (choose the manifest with highest non-zero score)
-
-- Deduplicating against the champions
-1. load the champion manifests from disk to the RAM
-> can use a small cache to speed this process since the adjacent segments sometimes have champions in common.
-
-### Implementation and Evaluation
-- Evaluation
-1. simulator based evaluation
-> input: a series of (chunk hash, length) pairs
-> divide into some segments, determines the champions for each segment, and then calculates the amount of deduplication obtained.
-
-2. setting
-chunk size: 4KB mean, which they find a good trade-off between maximizing deduplication and minimizing per-chunk overhead.
-3. dataset
-> Workgroup: a semi-regular series of backups of the desktop PCs of a group of 20 engineers. (3TB)
-> SMB: medium business server backedup to virtual tape, 0.6TB.
-
-4. Data Locality
-> Deduplicating each input segment against only 2 prior segments can sufficeto remove all but 1% of the duplicate chunks.
-> Larger segment size yield slightly less locality
-
-5. RAM Space usage
-Suppose the scenario of 100TB backup data, and compare the RAM usage with bloom filter.
-
-
-## 2. Strength (Contributions of the paper)
-1. the key strength of this paper is only maintaining the sparse index in RAM, which is much smaller than a full chunk index.
-> can be 128 times smaller than a full chunk index.
-> amortize the disk seek overhead of the thousands of chunks in each segment.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. this paper mentions the concept of chunk locality, and takes advantages of this property to adopt the segment level deduplication. But it does not mention how to measure chunk locality for a given workload?
-> without knowing the chunk locality, it is hard to decide the segment.
-
-
-2. In this paper, it uses a manifest to store the metadata of the segment, and put them in the disk.
-
-3. In this paper, it does not consider the client-side deduplication, the reason this paper argues
-> it needs the cost of some client-side processing
-> need to modify the legacy backup clients
+---
+typora-copy-images-to: ../paper_figure
+---
+Sparse Indexing: Large Scale, Inline Deduplication Using Sampling and Locality
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'09 | Deduplication Index |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+1. This paper wants to solve the chunk-lookup disk bottleneck problem of large-scale backup that inline, chunk-based deduplication schemes face.
+> originally, it needs to index every chunk.
+> **chunk-lookup disk bottleneck problem**
+
+2. It concerns the scenario of inline-deduplication
+> data is deduplicated as it arrives rather than later in batch mode.
+> the backup data is a form of data stream (e.g., 400GB image)
+
+3. This paper solves this problem via **chunk locality**.
+> 1. the tendency for chunks in backup data streams to reoccur together
+> 2. For example: if the last time it encountered chunk $A$, it was surrounded by chunks $B$, $C$, and $D$, then next time it encounters $A$ (even for different backup), it is likely that it will also encounter $B$, $C$ or $D$ nearby.
+
+### Sparse Index
+- Key observation
+If two pieces of backup streams share any chunks, they are likely to share many chunks.
+
+
+- Inline deduplication vs. out-of-line deduplication
+1. Inline deduplication: the data is deduplicated as it arrives and before it hits disk.
+2. Out-of-line deduplication: the data is first accumulated in an on-disk holding area and then deduplicated later in batch mode.
+
+The disadvantages of out-of-line deduplication:
+>1. the need for an on-disk holding area large enough to hold an entire backup window's worth of raw data.
+>2. need to implement a separate holding area.
+>3. it is not possible to conserve network or disk bandwidth because every chunk must be written to the holding area
+
+- Segmentation
+In its approach, a segment is the unit of storage and retrieval.
+> segment: a sequence of chunks, a few megabytes
+> two segments are similar: if they share a number of chunks.
+
+**Segment manifests**: segment recipe
+> allows reconstructing a segment from its chunks.
+> every stored segment has a manifest that is stored on disk.
+
+Segmentation method:
+> 1. Fixed-sized segments
+> 2. Variable-size segmentation: use the same trick as at the chunking level to avoid the boundary-shifting problem, basing the landmarks on the content.
+
+- Deduplication step
+Step 1: identify among all the segments in the store some that are most similar to the incoming segment
+> this paper calls them "champions"
+
+Step 2: deduplicate against those "champions" segments by finding the chunks they share with the incoming segment, which do not need to be stored again.
+
+
+
+
+- How to identify the similar segments
+sample the chunk hashes of the incoming segment, and use an in-RAM index to determine which already-stored segments contain how many of those hashes.
+> **sparse index**: maps the samples to the manifests in which they occur. (The number of manifests that any one hook can point to must be capped.)
+> once it has chosen champions, it can load their manifests into RAM and use them to deduplicate the incoming segment.
+> Assumption: by chunk locality, the champions are likely to share many other chunks with the incoming segment as well.
+> a **scoring scheme** determines the champions (choose the manifest with the highest non-zero score)
+
+- Deduplicating against the champions
+1. load the champion manifests from disk to the RAM
+> can use a small cache to speed this process since the adjacent segments sometimes have champions in common.
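+
+A minimal sketch of hook sampling and champion selection by score; the sampling rule (keeping hashes whose low bits are zero) and the greedy ranking by raw score are simplifying assumptions, not the paper's exact algorithm.
+
+```python
+SAMPLE_MASK = (1 << 6) - 1     # assumed sampling rate of 1/64: keep only "hooks"
+
+def hooks(segment_chunk_hashes):
+    # sample a small subset of the incoming segment's chunk hashes
+    return [h for h in segment_chunk_hashes
+            if int.from_bytes(h[-2:], "big") & SAMPLE_MASK == 0]
+
+def choose_champions(segment_chunk_hashes, sparse_index, max_champions=2):
+    # sparse_index: hook -> manifest ids containing that hook (capped per hook)
+    scores = {}
+    for hook in hooks(segment_chunk_hashes):
+        for manifest_id in sparse_index.get(hook, []):
+            scores[manifest_id] = scores.get(manifest_id, 0) + 1
+    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
+    return [m for m, score in ranked[:max_champions] if score > 0]
+```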
+
+### Implementation and Evaluation
+- Evaluation
+1. simulator based evaluation
+> input: a series of (chunk hash, length) pairs
+> divides the input into segments, determines the champions for each segment, and then calculates the amount of deduplication obtained.
+
+2. setting
+chunk size: 4KB mean, which they find a good trade-off between maximizing deduplication and minimizing per-chunk overhead.
+3. dataset
+> Workgroup: a semi-regular series of backups of the desktop PCs of a group of 20 engineers. (3TB)
+> SMB: a medium business server backed up to virtual tape, 0.6TB.
+
+4. Data Locality
+> Deduplicating each input segment against only 2 prior segments can suffice to remove all but 1% of the duplicate chunks.
+> Larger segment sizes yield slightly less locality
+
+5. RAM Space usage
+Consider the scenario of 100TB of backup data, and compare the RAM usage with a Bloom-filter-based approach.
+
+
+## 2. Strength (Contributions of the paper)
+1. the key strength of this paper is only maintaining the sparse index in RAM, which is much smaller than a full chunk index.
+> can be 128 times smaller than a full chunk index.
+> amortize the disk seek overhead of the thousands of chunks in each segment.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. this paper mentions the concept of chunk locality and takes advantage of this property to adopt segment-level deduplication. However, it does not mention how to measure chunk locality for a given workload.
+> without knowing the chunk locality, it is hard to decide on the segmentation.
+
+
+2. This paper uses a manifest to store the metadata of each segment and keeps the manifests on disk.
+
+3. This paper does not consider client-side deduplication; the reasons it gives are:
+> it incurs the cost of some client-side processing
+> it needs to modify legacy backup clients
diff --git a/StoragePaperNote/Deduplication/Deduplication-System-Design/dedupv1-MSST'10.md b/StoragePaperNote/Deduplication/Deduplication-System-Design/dedupv1-MSST'10.md
old mode 100644
new mode 100755
index a39f110..6a57735
--- a/StoragePaperNote/Deduplication/Deduplication-System-Design/dedupv1-MSST'10.md
+++ b/StoragePaperNote/Deduplication/Deduplication-System-Design/dedupv1-MSST'10.md
@@ -1,91 +1,91 @@
----
-typora-copy-images-to: ../paper_figure
----
-dedupv1: Improving Deduplication Throughput using Solid State Drives (SSD)
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| MSST'10 | Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The limited random IO performance of hard disks limits the overall throughput of deduplication system
-> if the index does not fit into main memory
-
-The size of the chunk index limits the usable capacity of the deduplication system.
-> 8KB chunk size, 20 byte fingerprint: 1TB data $\Rightarrow$ 2.5GB chunk index
-> It becomes necessary to store the fingerprint index on disk.
-
-This paper wants to use solid-state drives (SSDs) to improve its throughput compared to disk-based systems.
-> However, most current SSDs suffer from slow random writes. Its design proposes to avoid random writes on the critical data path.
-> The weak point of SSDs is the limited number of random writes.
-
-### dedupv1
-- System architecture
-
-The data deduplication is transparent to the user of the SCSI target.
-1. Chunking and Fingerprint
-Each chunk is fingerprinted after the chunking process using a cryptographic hash function.
-
-2. Filter chain state
-The file chain can execute a serirs of filters
-> after each filter step, the result of the filter determines which filter steps are executed afterwards.
-
-**EXISTING**: the current chunk is an exact duplicate
-**STRONG-MAYBE**: there is a very high probability that the current chunk is a duplicate
-> a typical result after a fingerprint comparison.
-
-**WEAK-MAYBE**: the filter cannot make ant statement about the duplication state of the chunk.
-**NON-EXISTING**: the filter rules out the possibility that the chunk is already known.
-
-- Filter implementations
-1. Chunk index filter
-2. Block index filter
-3. Byte compare filter
-4. Bloom filter
-5. Container cache filter
-
-
-
-- Chunk index
-Auxiliary chunk index stores chunk entries for all chunks whose containers are not yet written to disk as such chunk entires are only allowed to be stored persistently after the chunk data is committed.
-> take index writes out of the critical path
-> if the auxiliary index grows beyond a certain limit or if the system is idle, a background thread moves chunk metadata from the auxiliary index to the persistent index.
-
-
-- Block index
-It stores the metadata which is necessary to map a block of the iSCSI device to the chunks of varying length that from the most version of the block data.
-> a block mapping contains multiple full chunks.
-
-
-
-
-- Log
-Two goals
-1. recover from system crashes: just replay
-2. delay write operations
-Also helps to delay may write operations
-> the amount of IO operations in the critical path is minimized because the log assures that the delayed operations can be recovered.
-
-
-### Implementation and Evaluation
-- Evaluation
-1. distinguish the first backup generation and further backup generation
-> Generate first backup generation: 128GB subset of files
-> Generate second backupo generation based on the first backup generation
-
-2. evaluate the throughput using different index storage systems
-
-## 2. Strength (Contributions of the paper)
-1. the key novelty of this paper is it uses the auxiliary chunk index to store chunk entires for chunks whose containers are not yet written to disk.
-> take index writes out of the critical path.
-> delay the IO such that the update operations can be done **outside the critical path**.
-
-
-## 3. Weakness (Limitations of the paper)
-1. this idea of this paper is not novel.
-
-## 4. Future Works
-1. This paper shows that it is possible to build a deduplication system using SSDs that can provide a higher performance.
+---
+typora-copy-images-to: ../paper_figure
+---
+dedupv1: Improving Deduplication Throughput using Solid State Drives (SSD)
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| MSST'10 | Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The limited random IO performance of hard disks limits the overall throughput of a deduplication system
+> if the index does not fit into main memory
+
+The size of the chunk index limits the usable capacity of the deduplication system.
+> 8KB chunk size, 20 byte fingerprint: 1TB data $\Rightarrow$ 2.5GB chunk index
+> It becomes necessary to store the fingerprint index on disk.
+
+This paper wants to use solid-state drives (SSDs) to improve its throughput compared to disk-based systems.
+> However, most current SSDs suffer from slow random writes. Its design proposes to avoid random writes on the critical data path.
+> The weak point of SSDs is the limited number of random writes.
+
+### dedupv1
+- System architecture
+
+The data deduplication is transparent to the user of the SCSI target.
+1. Chunking and Fingerprint
+Each chunk is fingerprinted after the chunking process using a cryptographic hash function.
+
+2. Filter chain state
+The filter chain can execute a series of filters
+> after each filter step, the result of the filter determines which filter steps are executed afterwards.
+
+**EXISTING**: the current chunk is an exact duplicate
+**STRONG-MAYBE**: there is a very high probability that the current chunk is a duplicate
+> a typical result after a fingerprint comparison.
+
+**WEAK-MAYBE**: the filter cannot make any statement about the duplication state of the chunk.
+**NON-EXISTING**: the filter rules out the possibility that the chunk is already known.
+
+- Filter implementations
+1. Chunk index filter
+2. Block index filter
+3. Byte compare filter
+4. Bloom filter
+5. Container cache filter
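+
+A hedged sketch of the filter-chain control flow described above; the filter interface and early-exit rules are simplified assumptions, not dedupv1's actual code.
+
+```python
+from enum import Enum
+
+class FilterResult(Enum):
+    NON_EXISTING = 1    # definitely new (e.g., Bloom filter miss)
+    WEAK_MAYBE = 2      # no statement possible, keep checking
+    STRONG_MAYBE = 3    # very likely a duplicate (e.g., fingerprint match)
+    EXISTING = 4        # exact duplicate confirmed (e.g., byte compare)
+
+def run_filter_chain(chunk, filters):
+    # each filter inspects the chunk; its result decides whether later,
+    # more expensive filters still need to run
+    verdict = FilterResult.WEAK_MAYBE
+    for f in filters:
+        result = f(chunk)
+        if result in (FilterResult.EXISTING, FilterResult.NON_EXISTING):
+            return result               # final answer, stop the chain
+        if result.value > verdict.value:
+            verdict = result
+    return verdict
+```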
+
+
+
+- Chunk index
+The auxiliary chunk index stores chunk entries for all chunks whose containers are not yet written to disk, as such chunk entries are only allowed to be stored persistently after the chunk data is committed.
+> take index writes out of the critical path
+> if the auxiliary index grows beyond a certain limit or if the system is idle, a background thread moves chunk metadata from the auxiliary index to the persistent index.
+
+
+- Block index
+It stores the metadata which is necessary to map a block of the iSCSI device to the chunks of varying length that form the most recent version of the block data.
+> a block mapping contains multiple full chunks.
+
+
+
+
+- Log
+Two goals
+1. recover from system crashes: just replay
+2. delay write operations
+It also helps to delay many write operations
+> the amount of IO operations in the critical path is minimized because the log assures that the delayed operations can be recovered.
+
+
+### Implementation and Evaluation
+- Evaluation
+1. distinguish the first backup generation from further backup generations
+> Generate the first backup generation: a 128GB subset of files
+> Generate the second backup generation based on the first backup generation
+
+2. evaluate the throughput using different index storage systems
+
+## 2. Strength (Contributions of the paper)
+1. the key novelty of this paper is that it uses the auxiliary chunk index to store chunk entries for chunks whose containers are not yet written to disk.
+> take index writes out of the critical path.
+> delay the IO such that the update operations can be done **outside the critical path**.
+
+
+## 3. Weakness (Limitations of the paper)
+1. The idea of this paper is not particularly novel.
+
+## 4. Future Works
+1. This paper shows that it is possible to build a deduplication system using SSDs that can provide a higher performance.
2. A key insight of this paper is that it leverages a filter chain to find exact duplicate chunks, instead of directly relying on "compare-by-hash"
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Distributed-Dedup/ClusterSingle-ToS'18.md b/StoragePaperNote/Deduplication/Distributed-Dedup/ClusterSingle-ToS'18.md
old mode 100644
new mode 100755
index 6035f09..efea172
--- a/StoragePaperNote/Deduplication/Distributed-Dedup/ClusterSingle-ToS'18.md
+++ b/StoragePaperNote/Deduplication/Distributed-Dedup/ClusterSingle-ToS'18.md
@@ -1,111 +1,111 @@
----
-typora-copy-images-to: ../paper_figure
----
-Cluster and Single-Node Analysis of Long-Term Deduplication Patterns
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ToS'18 | Distributed Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
- - Most past studies either were of a small static snapshot or covered only a short period that was **not representative** of how a backup system evolves over time.
- - By understanding such datasets' characteristics, it can design more efficient storage systems.
- - Single-node deduplication vs. Cluster deduplication
- - can distribute the load and aggregating throughput from multiple nodes.
-
-### Cluster and Single-node analysis
-- Stateless vs. stateful
- - stateless strategy is designed for *implementation simplicity*
- - stateful strategy checks the similarity of the super-chunk in all nodes
-
-- How to simulate the incremental backup?
-
-need to detect newly added and modified files.
-> comparing two consecutive snapshot, it can identify whether a file was newly added
-> checking the *mtime* (modified time)
-
-- How to estimate the metadata overhead?
-
-Follow the work of FAST'12
-$f$ is the size of each chunk's metadata divided by the chunk size
-$L$ is the size before deduplication
-$P$ is the raw data size afterwards
-
-$f \times (L+P)$, $f \times L$ is the size of a **file's recipe**, $f \times P$ is the size of the **hash index**
-
-- The benefit of larger chunk sizes
- - With higher deduplication ratios, metadata can become a **large fraction** of post-deduplication space.
-> larger chunk sizes reduce the number of metadata entries.
-> reduce the amount of metadata also reduces the number of I/Os to the chunk index.
-
-
-- Chunk popularity
- - the number of duplicate occurrences of a chunk
-> the skew in chunk popularity has also been found in primary storage and HPC systems
-> Identifying such popular chunks would be useful in optimizing performance.
->
-> > accelerate chunk indexing and improve cache hit ratios
-
-- Analysis of groups of users
- - In a cluster deduplication system, grouping similar users into one storage node would improve the overall deduplication ratio
- - A given chunk is shared between two or more users, it is also more likely to appear several times in any single user's data.
-
-- Analysis of cluster deduplication
- - Two challenges:
- - Maintaining high deduplication ratio (each node performs individually and no information about further duplicates is exchanged among them)
- - Balancing the load across cluster nodes: trade-off between load balance and deduplication ratios
- - different design principles: deduplication ratio, load distribution, and throughput
-
-- Chunk-level routing
-
-Chunk-level routing is required to achieve exact deduplication
-> resulting in more CPU and memory consumption.
-
-- Key Metrics for cluster-deduplication
- - Cluster deduplication ratio
- - Load-Distribution
- - Communication overhead
-
-- Load distribution
- - Physical load distributions: the capacity usage at the nodes
- - Logical load distributions: I/O performance
- - the performance of load balance is the **Coefficient of Variation**
- - the ratio of the standard deviation to the mean
- - Stateless algorithm leads to high data skew in terms of the logical load distribution, which is opposite to their performance in terms of the logical load distribution.
- - Stateful approach incurs the most communication overhead, since it needs to send information to all storage nodes to request its similarity index. Using a master node may decrease the communication overhead, but it needs to store all the Bloom filters in the master node and the master node might become a bottleneck as the system scales up.
- - Stateless causes lower communication overhead since the client can choose the destination node without sending any messages.
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-1. classify data-routing algorithms and implement seven published algorithms that adopt different strategies
-> provide a detailed comparative analysis.
-
-2. study a locally collected dataset that spans a period of 2.5 years (4000 daily user snapshot)
-
-## 3. Weakness (Limitations of the paper)
-1. Does not consider aging and fragmentation effects in this trace
-2. Does not consider the restore performance, especially for deduplication clusters.
-
-
-## 4. Some Insights (Future work)
-1. Because of the size of the chunk index itself, smaller chunk sizes are not always better at saving space
-> the impact of chunk sizes
-
-2. Even similar users behave quite differently, and this should be taken into account in future deduplication systems.
-
-3. In distributed deduplication, the routing algorithm that can achieve a good physical load balance may lead to a huge skew in their logical distribution.
-
-4. Deduplication trace
-SYSTOR'09: Virtual machine disk images
-ATC'11: Microsoft primary storage: MS 857 snapshots over 4 weeks. (**primary storage**)
-FAST'12: EMC's Data Domain backup system (**backup**)
-SYSTOR'12: file analysis, file type and high redundancy
-SC'12: HPC trace
-
-5. Some insights
-Routing by file types can generally achieve a better deduplication ratio than other schemes
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Cluster and Single-Node Analysis of Long-Term Deduplication Patterns
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ToS'18 | Distributed Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+ - Most past studies either were of a small static snapshot or covered only a short period that was **not representative** of how a backup system evolves over time.
+ - By understanding such datasets' characteristics, it can design more efficient storage systems.
+ - Single-node deduplication vs. Cluster deduplication
+  - can distribute the load and aggregate throughput across multiple nodes.
+
+### Cluster and Single-node analysis
+- Stateless vs. stateful
+ - stateless strategy is designed for *implementation simplicity*
+ - stateful strategy checks the similarity of the super-chunk in all nodes
+
+- How to simulate the incremental backup?
+
+need to detect newly added and modified files.
+> by comparing two consecutive snapshots, it can identify whether a file was newly added
+> checking the *mtime* (modified time)
+
+- How to estimate the metadata overhead?
+
+Follow the work of FAST'12
+$f$ is the size of each chunk's metadata divided by the chunk size
+$L$ is the size before deduplication
+$P$ is the physical data size after deduplication
+
+The total metadata overhead is $f \times (L+P)$, where $f \times L$ is the size of the **file recipes** and $f \times P$ is the size of the **hash index**
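+
+For concreteness, a small worked sketch of this estimate; the chunk size, per-chunk metadata size, and dataset sizes below are illustrative numbers, not values from the paper.
+
+```python
+def metadata_overhead(L, P, chunk_size, per_chunk_metadata):
+    # f: metadata bytes per byte of chunk data
+    f = per_chunk_metadata / chunk_size
+    recipes = f * L      # file recipes cover the logical (pre-dedup) size
+    index = f * P        # hash index covers the physical (post-dedup) size
+    return recipes + index
+
+# Illustrative example: 8 KiB chunks, 30 bytes of metadata per chunk,
+# 100 TiB logical data deduplicated down to 10 TiB physical data.
+TiB = 1 << 40
+print(metadata_overhead(100 * TiB, 10 * TiB, 8 * 1024, 30) / TiB)  # ~0.4 TiB
+```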
+
+- The benefit of larger chunk sizes
+ - With higher deduplication ratios, metadata can become a **large fraction** of post-deduplication space.
+> larger chunk sizes reduce the number of metadata entries.
+> reduce the amount of metadata also reduces the number of I/Os to the chunk index.
+
+
+- Chunk popularity
+ - the number of duplicate occurrences of a chunk
+> the skew in chunk popularity has also been found in primary storage and HPC systems
+> Identifying such popular chunks would be useful in optimizing performance.
+>
+> > accelerate chunk indexing and improve cache hit ratios
+
+- Analysis of groups of users
+ - In a cluster deduplication system, grouping similar users into one storage node would improve the overall deduplication ratio
+  - If a given chunk is shared between two or more users, it is also more likely to appear several times in any single user's data.
+
+- Analysis of cluster deduplication
+ - Two challenges:
+ - Maintaining high deduplication ratio (each node performs individually and no information about further duplicates is exchanged among them)
+ - Balancing the load across cluster nodes: trade-off between load balance and deduplication ratios
+ - different design principles: deduplication ratio, load distribution, and throughput
+
+- Chunk-level routing
+
+Chunk-level routing is required to achieve exact deduplication
+> resulting in more CPU and memory consumption.
+
+- Key Metrics for cluster-deduplication
+ - Cluster deduplication ratio
+ - Load-Distribution
+ - Communication overhead
+
+- Load distribution
+ - Physical load distributions: the capacity usage at the nodes
+ - Logical load distributions: I/O performance
+  - the metric for load balance is the **Coefficient of Variation** (see the sketch after this list)
+ - the ratio of the standard deviation to the mean
+  - Stateless algorithms lead to high data skew in terms of the logical load distribution, which is the opposite of their behavior in terms of the physical load distribution.
+ - Stateful approach incurs the most communication overhead, since it needs to send information to all storage nodes to request its similarity index. Using a master node may decrease the communication overhead, but it needs to store all the Bloom filters in the master node and the master node might become a bottleneck as the system scales up.
+ - Stateless causes lower communication overhead since the client can choose the destination node without sending any messages.
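+
+As noted in the list above, load balance is reported as the Coefficient of Variation. A minimal computation looks like this (the per-node load numbers are made up for illustration; population standard deviation is assumed).
+
+```python
+import statistics
+
+def coefficient_of_variation(loads):
+    # CV = standard deviation / mean; lower means better balance across nodes
+    return statistics.pstdev(loads) / statistics.mean(loads)
+
+# Example: per-node stored capacity after routing a workload to 4 nodes
+print(coefficient_of_variation([96, 104, 100, 100]))   # ~0.03, well balanced
+print(coefficient_of_variation([10, 190, 100, 100]))   # ~0.64, heavily skewed
+```
+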
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+1. classify data-routing algorithms and implement seven published algorithms that adopt different strategies
+> provide a detailed comparative analysis.
+
+2. study a locally collected dataset that spans a period of 2.5 years (4,000 daily user snapshots)
+
+## 3. Weakness (Limitations of the paper)
+1. Does not consider aging and fragmentation effects in this trace
+2. Does not consider the restore performance, especially for deduplication clusters.
+
+
+## 4. Some Insights (Future work)
+1. Because of the size of the chunk index itself, smaller chunk sizes are not always better at saving space
+> the impact of chunk sizes
+
+2. Even similar users behave quite differently, and this should be taken into account in future deduplication systems.
+
+3. In distributed deduplication, the routing algorithm that can achieve a good physical load balance may lead to a huge skew in their logical distribution.
+
+4. Deduplication trace
+SYSTOR'09: Virtual machine disk images
+ATC'11: Microsoft primary storage: MS 857 snapshots over 4 weeks. (**primary storage**)
+FAST'12: EMC's Data Domain backup system (**backup**)
+SYSTOR'12: file analysis, file type and high redundancy
+SC'12: HPC trace
+
+5. Some insights
+Routing by file types can generally achieve a better deduplication ratio than other schemes
+
diff --git a/StoragePaperNote/Deduplication/Distributed-Dedup/EDP-IWQoS'15.md b/StoragePaperNote/Deduplication/Distributed-Dedup/EDP-IWQoS'15.md
old mode 100644
new mode 100755
index ddc9814..d5d3898
--- a/StoragePaperNote/Deduplication/Distributed-Dedup/EDP-IWQoS'15.md
+++ b/StoragePaperNote/Deduplication/Distributed-Dedup/EDP-IWQoS'15.md
@@ -1,106 +1,106 @@
----
-typora-copy-images-to: ../paper_figure
----
-Even Data Placement for Load Balance in Reliable Distributed Deduplication Storage Systems
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| IWQoS'15 | Distributed Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- This paper considers the scenario of distributed deduplication
- - deduplication for storage efficiency
- - erasure coding for reliability
-
-- It targets the problem of load balance in the above scenario
- - read balance
- - storage balance
-
-- Motivation
-Conventional data placement cannot **simultaneously** achieve both read balance and storage balance objectives.
-> the trade-off between read balance and storage balance.
-
-Solve this problem by formulates a combinatorial optimization problem..
-> Even Data Placement (EDP) algorithm
-
-### EDP
-- Root cause:
-deduplication causes duplicate data chunks to refer to existing identical data chunks that a re **scattered across different nodes** in an unpredictable way.
-> file chunks may be clustered in a small number of nodes. $\Rightarrow$ poor read performance.
-
-- Integration (Deduplication + Erasure Coding)
-1. apply deduplication to remove duplicate chunks
-2. divide data into non-overlapping groups of $k$ unique chunks (as a stripe), then encode the $k$ data chunks to form additional $n-k$ parity chunks.
-> padding: when use variable-size chunking. (only store the non-padded chunks)
-
-- Baseline policy
-1. round-robin fashion: the number of storage nodes is equal to the erasure coding stripe size.
-> with deduplication: **inherently leads to uneven data distribution**.
-> breaking the connection between read balance and storage balance. (**parallel I/O accesses**)
-
-
-
-- Problem Model
-1. Per-batch basis
-Only consider the how to place the unique chunks to maximally achieve balanced read.
-> minimize the sum of the read balance gaps for all files.
-> has a huge solution space for this optimization problem.
-
-2. Heterogeneity Awareness
-Modern distributed storage systems are often composed of heterogeneous nodes , so the read latencies of data chunks differ across nodes.
-> introducing a weight $w_j$ for each node $j$.
-
-- Main Algorithm (based on greedy algorithm)
- - Goal: aim to efficiently identify a near-optimal data placement solution.
- - Two steps: Distribute and Swap.
- > Swap: attempts to swap the chunk positions of different file pairs to see if the summation of the read balance gaps can be further reduced.
-
-- Some extensions
- - Heterogeneity environment
- - Variable-size chunking: consider the number of bytes rather than number of chunks.
-
-- System Architecture
-
-
-> Metadata server runs the data placement policy, then responds to the client with the list of unique chunks and how they placed across nodes.
-> Clients then applies erasure coding to the unique chunks, and writes the encoded chunks to different nodes in parallel.
-> Metadata server: maintain metadata in key-value databases and implement them using **Kyoto Cabinet library**.
-
-
-### Implementation and Evaluation
-- Evaluation
- - Dataset: FSL-home, Linux kernel source code (unpacked), Linux kernel source code (tarball file)
- - filter small files of size less than 16KB
-
-- Simulation experiments
-1. Effectiveness of EDP
-2. Impact of erasure coding
-3. Impact of heterogeneity
-> varying I/O bandwidth across nodes
-
-- Testbed experiments
-1. the normalized read latency
-2. Impact of chunking schemes: fixed-size and variable-size chunking schemes
-3. Read latency distribution:
-4. Computational overhead: varying the number of files in each batch.
-
-## 2. Strength (Contributions of the paper)
-1. formulating an optimization problem for read balance and storage balance.
-> extend this optimization problem to **heterogeneous** environments.
-> EDP algorithm, greedy algorithm.
-
-2. implement a distributed storage system prototype
-> combining deduplication and erasure coding
-> extensive simulation and testbed experiments under real-world workloads.
-
-
-## 3. Weakness (Limitations of the paper)
-1. The overhead of this algorithm is high. Although EDP is in polynomial time, it needs to buffer each batch of unique chunks and wait the schedule result. I deem it would harm the performance dramatically.
-> near-optimal?
-
-## 4. Some Insights (Future work)
-1. this paper assumes that the network transmission is the bottleneck, so it needs to leverage the I/O parallel to guarantee the performance. If this assumption cannot keep, this problem may because unnecessary.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Even Data Placement for Load Balance in Reliable Distributed Deduplication Storage Systems
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| IWQoS'15 | Distributed Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- This paper considers the scenario of distributed deduplication
+ - deduplication for storage efficiency
+ - erasure coding for reliability
+
+- It targets the problem of load balance in the above scenario
+ - read balance
+ - storage balance
+
+- Motivation
+Conventional data placement cannot **simultaneously** achieve both read balance and storage balance objectives.
+> the trade-off between read balance and storage balance.
+
+It solves this problem by formulating a combinatorial optimization problem.
+> Even Data Placement (EDP) algorithm
+
+### EDP
+- Root cause:
+deduplication causes duplicate data chunks to refer to existing identical data chunks that are **scattered across different nodes** in an unpredictable way.
+> file chunks may be clustered in a small number of nodes. $\Rightarrow$ poor read performance.
+
+- Integration (Deduplication + Erasure Coding)
+1. apply deduplication to remove duplicate chunks
+2. divide data into non-overlapping groups of $k$ unique chunks (as a stripe), then encode the $k$ data chunks to form additional $n-k$ parity chunks.
+> padding: when using variable-size chunking, pad the stripe (only the non-padded chunks are stored); see the sketch below.
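+
+A minimal sketch of this integration step (illustrative only: chunks are modeled as raw bytes, and a single XOR parity stands in for the paper's general $(n,k)$ erasure code):
+
+```python
+def build_stripes(unique_chunks, k):
+    """Group unique chunks into stripes of k and add one XOR parity per stripe."""
+    stripes = []
+    for i in range(0, len(unique_chunks), k):
+        data = unique_chunks[i:i + k]                      # bytes objects
+        size = max(len(c) for c in data)
+        padded = [c.ljust(size, b'\0') for c in data]      # pad chunks within the stripe
+        while len(padded) < k:                             # pad the last, short stripe
+            padded.append(b'\0' * size)
+        parity = b'\0' * size
+        for c in padded:
+            parity = bytes(x ^ y for x, y in zip(parity, c))
+        stripes.append((data, parity))                     # only non-padded chunks are stored
+    return stripes
+```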
+
+- Baseline policy
+1. round-robin fashion: the number of storage nodes is equal to the erasure coding stripe size.
+> with deduplication: **inherently leads to uneven data distribution**.
+> breaking the connection between read balance and storage balance. (**parallel I/O accesses**)
+
+
+
+- Problem Model
+1. Per-batch basis
+It only considers how to place the unique chunks to maximally achieve balanced reads.
+> minimize the sum of the read balance gaps for all files.
+> has a huge solution space for this optimization problem.
+
+2. Heterogeneity Awareness
+Modern distributed storage systems are often composed of heterogeneous nodes, so the read latencies of data chunks differ across nodes.
+> introducing a weight $w_j$ for each node $j$.
+
+- Main Algorithm (based on greedy algorithm)
+ - Goal: aim to efficiently identify a near-optimal data placement solution.
+ - Two steps: Distribute and Swap.
+ > Swap: attempts to swap the chunk positions of different file pairs to see if the sum of the read balance gaps can be further reduced (a simplified sketch follows below).
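+
+A simplified sketch of the Distribute-and-Swap idea (the gap definition, tie-breaking, and data structures here are illustrative and differ from the paper's exact formulation):
+
+```python
+import math
+from collections import Counter
+
+def gap(placement, n_nodes):
+    """Read balance gap of one file: max chunks on any node minus the ideal share."""
+    if not placement:
+        return 0
+    per_node = Counter(placement)
+    return max(per_node.values()) - math.ceil(len(placement) / n_nodes)
+
+def distribute(files, n_nodes):
+    """files: {name: number_of_unique_chunks}; greedily spread each file's chunks."""
+    load = [0] * n_nodes                      # storage load per node
+    plan = {}
+    for name, n_chunks in files.items():
+        placement = []
+        for _ in range(n_chunks):
+            # prefer the node holding the fewest chunks of this file, then the least-loaded node
+            node = min(range(n_nodes), key=lambda j: (placement.count(j), load[j]))
+            placement.append(node)
+            load[node] += 1
+        plan[name] = placement
+    return plan
+
+def swap(plan, n_nodes):
+    """Swap chunk positions between file pairs while the total gap keeps decreasing."""
+    names, improved = list(plan), True
+    while improved:
+        improved = False
+        for a in names:
+            for b in names:
+                if a == b:
+                    continue
+                for i in range(len(plan[a])):
+                    for j in range(len(plan[b])):
+                        before = gap(plan[a], n_nodes) + gap(plan[b], n_nodes)
+                        plan[a][i], plan[b][j] = plan[b][j], plan[a][i]
+                        if gap(plan[a], n_nodes) + gap(plan[b], n_nodes) >= before:
+                            plan[a][i], plan[b][j] = plan[b][j], plan[a][i]   # revert
+                        else:
+                            improved = True
+    return plan
+```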
+
+- Some extensions
+ - Heterogeneity environment
+ - Variable-size chunking: consider the number of bytes rather than number of chunks.
+
+- System Architecture
+
+
+> The metadata server runs the data placement policy, then responds to the client with the list of unique chunks and how they are placed across nodes.
+> The client then applies erasure coding to the unique chunks and writes the encoded chunks to different nodes in parallel.
+> Metadata server: maintains metadata in key-value databases implemented with the **Kyoto Cabinet library**.
+
+
+### Implementation and Evaluation
+- Evaluation
+ - Dataset: FSL-home, Linux kernel source code (unpacked), Linux kernel source code (tarball file)
+ - filter small files of size less than 16KB
+
+- Simulation experiments
+1. Effectiveness of EDP
+2. Impact of erasure coding
+3. Impact of heterogeneity
+> varying I/O bandwidth across nodes
+
+- Testbed experiments
+1. the normalized read latency
+2. Impact of chunking schemes: fixed-size and variable-size chunking schemes
+3. Read latency distribution:
+4. Computational overhead: varying the number of files in each batch.
+
+## 2. Strength (Contributions of the paper)
+1. formulating an optimization problem for read balance and storage balance.
+> extend this optimization problem to **heterogeneous** environments.
+> EDP algorithm, greedy algorithm.
+
+2. implement a distributed storage system prototype
+> combining deduplication and erasure coding
+> extensive simulation and testbed experiments under real-world workloads.
+
+
+## 3. Weakness (Limitations of the paper)
+1. The overhead of this algorithm is high. Although EDP runs in polynomial time, it needs to buffer each batch of unique chunks and wait for the scheduling result, which I suspect would harm write performance dramatically.
+> near-optimal?
+
+## 4. Some Insights (Future work)
+1. This paper assumes that network transmission is the bottleneck, so it needs to leverage I/O parallelism to guarantee performance. If this assumption does not hold, the problem may become unnecessary.
+
2. A trade-off between read balance and storage balance.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Distributed-Dedup/GoSeed-FAST'20.md b/StoragePaperNote/Deduplication/Distributed-Dedup/GoSeed-FAST'20.md
old mode 100644
new mode 100755
index 28179df..6e02209
--- a/StoragePaperNote/Deduplication/Distributed-Dedup/GoSeed-FAST'20.md
+++ b/StoragePaperNote/Deduplication/Distributed-Dedup/GoSeed-FAST'20.md
@@ -1,111 +1,111 @@
----
-typora-copy-images-to: ../paper_figure
----
-GoSeed: Generating an Optimal Seeding Plan for Deduplicated Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'20 | Deduplication Migration |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-Data migration plans must consider the dependencies between files that are remapped to new volumes and files that are not.
-> data migration: moving a portion of a physical volume's data to another volume
-> typically performed for load balancing and resizing
-
-*Migration plan*: an efficient migration plan will free the required space on the source volume while minimizing the space occupied on the target.
-> Previous studies propose greedy algorithms for determining the set of migrated files
-> when migrating a file from one volume to another, this file's chunks that also belong to another file must be copied (deduplicated), rather than moved. (the chunks should reside on the same storage volume)
-
-
-This paper bridges this gap for seeding and, consequently, space reclamation problems.
-
-### GoSeed
-- Main goal:
-space reclamation problems, formulation of seeding as an integer linear programming (ILP) problem.
-> providing a theoretical framework for generating an optimal plan by minimizing the amount of data replicated. (NP-hard)
-
-- Challenges
-commercial optimizers can solve ILP with **hundreds of thousands ** of variables and constraints
-> (real-world case may consist of hundreds of millions of variables and constraints), may require long time to process these instances
-
-How to do the tradeoff between runtime and optimality?
-> make solving ILP more practical.
-
-- GoSeed ILP optimization
-1. Input:
-> $M$: the physical data size moved from one volume to another (can have a slack factor)
-> unique block set in the volume
-> the file set: the mapping relationship between files and blocks
-
-2. objective:
-$R$: the total size of the physical data that must be copied (replicated) as a result.
-minimizing $R$
-
-3. output: (Boolean variables)
-the list of files that are remapped
-the list of blocks that are replicated
-the list of blocks that are migrated from
-
-the complexity depends on the deduplication ratio and on the pattern of data sharing between the files
-> number of files and blocks
-
-
-- GoSeed acceleration methods
-The runtime for generating a migration plan for a source volume with several TBs of data would be unacceptably long.
-> need the method for reducing this generation time
-
-1. Solver timeout
-The runtime of an ILP solver can be limited by specifying a timeout value, return the feasible solution before the timeout value.
-> advantage: easy to process
-> downside: cannot tell how far the suboptimal solution is from the optimal one.
-
-2. Fingerprint sampling
-Using a sampling degree $k$, it includes in its sample all chunks whose fingperprint contains $k$ leading zeros, and all the files containing those chunks.
-> reduce the size of the ILP instance by a predictable factor.
-
-3. Container-based aggregation
-Formulate the migration problem with containers
-> coalesce chunks that are stored in the same container into a single block.
-
-### Implementation and Evaluation
-- Implementation
-Using **Gurobi optimizer** as its ILP solver (C++ interface to define its problem instance)
-> a wrapper program for converting a volume snapshot into an ILP instance in Gurobi
-
-- Evaluation
-Two datasets: UBC and FSL
-> treat a snapshot as a file in its scenario.
-
-Comparison to existing approaches:
-1. Rangoli
-2. SGreedy (Sketch Volume)
-
-Effect of ILP parameters
-Effect of solver timeout
-Effect of fingerprint sampling
-Efficiency of container-based plans
-
-## 2. Strength (Contributions of the paper)
-1. This paper formulates the seeding problem as a ILP problem, and proposes three methods to accelerate the solving speed.
-> solver timeout
-> fingerprint sampling
-> container aggregation
-
-
-## 3. Weakness (Limitations of the paper)
-1. Lack the part of how to use this ILP solver to build a real system
-
-
-## 4. Some Insights (Future work)
-1. This paper mentions providing these systems with a set of management functions is the next challenge in the design of deduplication systems.
-> capacity management in deduplicated systems
-> fast and effective data migration
-
-2. It also mentions that most ILP solvers are based on the Simplex algorithm
-> solves linear programming where the variables are not necessarily integers.
-> Then, search for an optimal integer solution, starting the search at the vicinity of the non-integer one.
-
-This also can support the algorithm in TED.
+---
+typora-copy-images-to: ../paper_figure
+---
+GoSeed: Generating an Optimal Seeding Plan for Deduplicated Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'20 | Deduplication Migration |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+Data migration plans must consider the dependencies between files that are remapped to new volumes and files that are not.
+> data migration: moving a portion of a physical volume's data to another volume
+> typically performed for load balancing and resizing
+
+*Migration plan*: an efficient migration plan will free the required space on the source volume while minimizing the space occupied on the target.
+> Previous studies propose greedy algorithms for determining the set of migrated files
+> when migrating a file from one volume to another, this file's chunks that also belong to another file must be copied (deduplicated), rather than moved. (the chunks should reside on the same storage volume)
+
+
+This paper bridges this gap for seeding and, consequently, space reclamation problems.
+
+### GoSeed
+- Main goal:
+It formulates seeding, and consequently space reclamation, as an integer linear programming (ILP) problem.
+> providing a theoretical framework for generating an optimal plan by minimizing the amount of data replicated. (NP-hard)
+
+- Challenges
+Commercial optimizers can solve ILP instances with **hundreds of thousands** of variables and constraints.
+> Real-world cases may consist of hundreds of millions of variables and constraints, and may require a long time to process.
+
+How to trade off runtime against optimality?
+> make solving ILP more practical.
+
+- GoSeed ILP optimization
+1. Input:
+> $M$: the physical data size moved from one volume to another (can have a slack factor)
+> unique block set in the volume
+> the file set: the mapping relationship between files and blocks
+
+2. objective:
+$R$: the total size of the physical data that must be copied (replicated) as a result.
+minimizing $R$
+
+3. output: (Boolean variables)
+the list of files that are remapped
+the list of blocks that are replicated
+the list of blocks that are migrated from the source volume
+
+the complexity depends on the deduplication ratio and on the pattern of data sharing between the files
+> number of files and blocks
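+
+A hedged sketch of such an ILP using PuLP (the paper uses the Gurobi C++ interface; the exact constraints and slack handling below are my reading of the summary above, so treat names and details as assumptions):
+
+```python
+import pulp
+
+def seeding_ilp(files, block_size, M, slack=0.05):
+    """files: {file_id: set(block_ids)}; block_size: {block_id: size in bytes}."""
+    blocks = set(b for bs in files.values() for b in bs)
+    x = pulp.LpVariable.dicts("remap", files, cat="Binary")       # file is remapped to the target
+    m = pulp.LpVariable.dicts("migrate", blocks, cat="Binary")    # block physically moves
+    r = pulp.LpVariable.dicts("replicate", blocks, cat="Binary")  # block is copied
+
+    prob = pulp.LpProblem("seeding_sketch", pulp.LpMinimize)
+    prob += pulp.lpSum(block_size[b] * r[b] for b in blocks)      # minimize replicated size R
+
+    for f, bs in files.items():
+        for b in bs:
+            prob += m[b] + r[b] >= x[f]   # a remapped file needs each block moved or copied
+            prob += m[b] <= x[f]          # a block may move only if every owning file is remapped
+
+    moved = pulp.lpSum(block_size[b] * m[b] for b in blocks)
+    prob += moved >= M * (1 - slack)      # free roughly M bytes on the source volume
+    prob += moved <= M * (1 + slack)
+
+    prob.solve(pulp.PULP_CBC_CMD(timeLimit=600))   # solver timeout, cf. the acceleration methods below
+    return {f for f in files if x[f].value() == 1}
+```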
+
+
+- GoSeed acceleration methods
+The runtime for generating a migration plan for a source volume with several TBs of data would be unacceptably long.
+> need the method for reducing this generation time
+
+1. Solver timeout
+The runtime of an ILP solver can be limited by specifying a timeout value; the solver returns the best feasible solution found before the timeout.
+> advantage: easy to process
+> downside: cannot tell how far the suboptimal solution is from the optimal one.
+
+2. Fingerprint sampling
+Using a sampling degree $k$, it includes in its sample all chunks whose fingerprint contains $k$ leading zeros, and all the files containing those chunks (see the sketch below).
+> reduce the size of the ILP instance by a predictable factor.
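+
+A small sketch of this sampling rule (fingerprints are modeled as integers of an assumed bit width; names are illustrative):
+
+```python
+def sample_instance(files, k, fp_bits=160):
+    """Keep chunks whose fingerprint has k leading zero bits, plus the files containing them."""
+    def sampled(fp):
+        return fp >> (fp_bits - k) == 0        # expected fraction kept: 1 / 2^k
+    kept = {f: {fp for fp in fps if sampled(fp)} for f, fps in files.items()}
+    return {f: fps for f, fps in kept.items() if fps}   # drop files with no sampled chunks
+```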
+
+3. Container-based aggregation
+Formulate the migration problem with containers
+> coalesce chunks that are stored in the same container into a single block.
+
+### Implementation and Evaluation
+- Implementation
+Using **Gurobi optimizer** as its ILP solver (C++ interface to define its problem instance)
+> a wrapper program for converting a volume snapshot into an ILP instance in Gurobi
+
+- Evaluation
+Two datasets: UBC and FSL
+> treat a snapshot as a file in its scenario.
+
+Comparison to existing approaches:
+1. Rangoli
+2. SGreedy (Sketch Volume)
+
+Effect of ILP parameters
+Effect of solver timeout
+Effect of fingerprint sampling
+Efficiency of container-based plans
+
+## 2. Strength (Contributions of the paper)
+1. This paper formulates the seeding problem as an ILP problem and proposes three methods to accelerate solving.
+> solver timeout
+> fingerprint sampling
+> container aggregation
+
+
+## 3. Weakness (Limitations of the paper)
+1. It lacks a discussion of how to use this ILP solver to build a real system.
+
+
+## 4. Some Insights (Future work)
+1. This paper mentions that providing these systems with a set of management functions is the next challenge in the design of deduplication systems.
+> capacity management in deduplicated systems
+> fast and effective data migration
+
+2. It also mentions that most ILP solvers are based on the Simplex algorithm
+> which solves the linear program where the variables are not necessarily integers.
+> Then, they search for an optimal integer solution, starting in the vicinity of the non-integer one.
+
+This can also support the algorithm in TED.
diff --git a/StoragePaperNote/Deduplication/Distributed-Dedup/Produck-SoCC'12.md b/StoragePaperNote/Deduplication/Distributed-Dedup/Produck-SoCC'12.md
old mode 100644
new mode 100755
index c2a147e..2bdc945
--- a/StoragePaperNote/Deduplication/Distributed-Dedup/Produck-SoCC'12.md
+++ b/StoragePaperNote/Deduplication/Distributed-Dedup/Produck-SoCC'12.md
@@ -1,123 +1,123 @@
----
-typora-copy-images-to: ../paper_figure
----
-Probabilistic Deduplication for Cluster-Based Storage Systems
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| SoCC'12 | Distributed Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Integrating deduplication into cluster-based backup solution is a challenging task.
- - maximizing deduplication
- - balancing the storage load on the available nodes
-
-- Countermeasure
-> 1. **stateless solutions**: assign data to nodes based only on the observation of the **content of the chunk** of data being stored.
-> 2. **stateful solutions**: maintain information about currently stored data in order to assign identical data blocks to the same node.
-
-- Key question
-The overhead of **stateful solutions** is very high which makes them impractical. How to make practical stateful system becomes available.
-
-### ProDuck
-
-- Architecture
- - Coordinator node: it is responsible for managing the requests of clients, determining which StorageNode should store each chunk of a given file.
- - Coordinator implements **novel chunk assignment** and **load balance strategies**
-
-
-
-
-- Chunking
- - content-based chunking
- - Group contiguous chunks into large superchunks when storing data onto storage nodes. (content-defined, checking the fingerprint of the last chunk)
- - average superchunk size of 15 * 1024 chunks
-
-- Chunk Assignment
-1. Assign superchunks to StorageNode so as to maximize deduplication. This assignment is carried out by the Coordinator through a novel chunk-assignment protocol.
-> 1. key observation: to choose the StorageNode offering the highest deduplication rate for a given **superchunk**, the protocol does not need to know exactly which chunks are stored on each StorageNode.
-> 2. it only needs the overlap between the chunks on each StorageNode and those in the superchunk to be stored.
-
-2. Probabilistic multiset intersection
-ProDuck uses PCSA to estimate the cardinality of a multiset. And this can extend to estimate the cardinality of the union of two multisets.
-> simply applying the bitwise OR operator on the two corresponding **bitmap vectors**.
-$$
-BITMAP|A \cup B| = BITMAP[A] | BITMAP[B]
-$$
-$$
-|A \cap B| = |A| + |B| - |A \cup B|
-$$
-
-3. Maintaining chunk information
-multiset:
-> 1. each superchunk.
-> 2. multiple superchunks stored in StorageNode.
-
-The coordinator stores a bitmap vector for each StorageNode
-> 64KB for each StorageNode
-> minimal space requirements
-
-Update when receiving a bitmap vector of the incoming superchunk.
-
-4. Choosing the best storage node
-calculating the overlap between the node's content and a superchunk.
-> update the corresponding the bitmap of the determined storage node.
-
-Also consider the data locality in backup workload:
-> rank the top $k$ StorageNode according to their overlaps, check if the node that stored the previous superchunk in the file is among these $k$.
-
-
-- Load Balancing
-1. Using bucket-based load-balancing strategy
-> splitting the storage space of each node into fixed-size buckets.
-
-The coordinator grants the StorageNode a new bucket only if doing so would not exceed the number of buckets allocated of the least loaded node by more than one.
-
-
-### Implementation and Evaluation
-- Evaluation
- - Dataset: *Wikipedia* snapshot (publicly available for download), private dataset.
- - Baseline: BloomFilter (stateful), MinHash (stateless),
-
-1. BloomFilter (stateful)
-the Coordinator maintains a bloom filter for each storage node an index of the node's contents.
-> the decision of storing a new superchunk takes into account the location of existing ones.
-
-2. MinHash (stateless)
-select the minimum hash of the chunks in a superchunk as the superchunk's signature.
-
-- Metrics
- - **Total Deduplication**: the original size of the dataset and its size after being deduplicated. (compare with the optimal case of a single-node system with no load-balancing constraints)
- - **Data Skew**: the occupied storage space on the most loaded node to the average load in the system
- - **Assignment Time**: the time of each strategy to assign all the dataset to the available nodes.
-
-3. varying the node number of the cluster
-
-4. Sensitivity analysis
-> 1. superchunk size
-> 2. bucket size
-> 3. maximum allowed bucket difference
-
-
-## 2. Strength (Contributions of the paper)
-1. this paper proposes a lightweight probabilistic node-assignment mechanism
-> quickly identify the servers that can provide the highest deduplication rates for a given data chunks.
-
-2. a new bucket-based load-balancing strategy
-> spreads the load equally among the nodes.
-
-## 3. Weakness (Limitations of the paper)
-1. the rationale of its bucket-based load-balancing algorithm is not very clear. I do not understand why it can outperform than it counterparts.
-
-2. Also, there are some system parameters which need to be configured and may impact the result. For example, the superchunk size. How to automated configure those parameters for different workloads is still unclear.
-
-
-## 4. Some Insights (Future work)
-1. In this work, the two key issues are
-> 1) how to determine the storage node for each incoming superchunk to achieve better deduplication ratio?
-> 2) how to achieve the load-balancing under the above objective?
-
-For the first issue, the key is have to design a data structure to track the content of each storage node.
-> probabilistic counting, set intersection
+---
+typora-copy-images-to: ../paper_figure
+---
+Probabilistic Deduplication for Cluster-Based Storage Systems
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| SoCC'12 | Distributed Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Integrating deduplication into a cluster-based backup solution is a challenging task.
+ - maximizing deduplication
+ - balancing the storage load on the available nodes
+
+- Countermeasure
+> 1. **stateless solutions**: assign data to nodes based only on the observation of the **content of the chunk** of data being stored.
+> 2. **stateful solutions**: maintain information about currently stored data in order to assign identical data blocks to the same node.
+
+- Key question
+The overhead of **stateful solutions** is very high, which makes them impractical. The key question is how to make a stateful approach practical.
+
+### ProDuck
+
+- Architecture
+ - Coordinator node: it is responsible for managing the requests of clients, determining which StorageNode should store each chunk of a given file.
+ - Coordinator implements **novel chunk assignment** and **load balance strategies**
+
+
+
+
+- Chunking
+ - content-based chunking
+ - Group contiguous chunks into large superchunks when storing data onto storage nodes. (content-defined, checking the fingerprint of the last chunk)
+ - average superchunk size of 15 * 1024 chunks
+
+- Chunk Assignment
+1. Assign superchunks to StorageNode so as to maximize deduplication. This assignment is carried out by the Coordinator through a novel chunk-assignment protocol.
+> 1. key observation: to choose the StorageNode offering the highest deduplication rate for a given **superchunk**, the protocol does not need to know exactly which chunks are stored on each StorageNode.
+> 2. it only needs the overlap between the chunks on each StorageNode and those in the superchunk to be stored.
+
+2. Probabilistic multiset intersection
+ProDuck uses PCSA to estimate the cardinality of a multiset. And this can extend to estimate the cardinality of the union of two multisets.
+> simply applying the bitwise OR operator on the two corresponding **bitmap vectors**.
+$$
+BITMAP[A \cup B] = BITMAP[A] \mid BITMAP[B]
+$$
+$$
+|A \cap B| = |A| + |B| - |A \cup B|
+$$
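+
+A minimal PCSA-style sketch of these estimates (the bucket count, hash, and constant are illustrative, not ProDuck's actual parameters):
+
+```python
+import hashlib
+
+M_BUCKETS = 64            # number of small bitmaps
+PHI = 0.77351             # PCSA correction constant
+
+def _hash(chunk_fp):
+    return int.from_bytes(hashlib.sha1(chunk_fp).digest(), "big")
+
+def bitmap(chunks):
+    """Build a PCSA bitmap vector for a multiset of chunk fingerprints (bytes)."""
+    bm = [0] * M_BUCKETS
+    for c in chunks:
+        h = _hash(c)
+        bucket, rest = h % M_BUCKETS, h // M_BUCKETS
+        r = (rest & -rest).bit_length() - 1 if rest else 0   # index of lowest set bit
+        bm[bucket] |= 1 << r
+    return bm
+
+def cardinality(bm):
+    s = 0
+    for b in bm:                                 # position of lowest zero bit per bucket
+        r = 0
+        while b & (1 << r):
+            r += 1
+        s += r
+    return (M_BUCKETS / PHI) * 2 ** (s / M_BUCKETS)
+
+def union_bitmap(bm_a, bm_b):
+    return [a | b for a, b in zip(bm_a, bm_b)]   # BITMAP[A ∪ B] = BITMAP[A] | BITMAP[B]
+
+def overlap(bm_a, bm_b):
+    # |A ∩ B| = |A| + |B| − |A ∪ B|
+    return cardinality(bm_a) + cardinality(bm_b) - cardinality(union_bitmap(bm_a, bm_b))
+```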
+
+3. Maintaining chunk information
+multiset:
+> 1. each superchunk.
+> 2. multiple superchunks stored in StorageNode.
+
+The coordinator stores a bitmap vector for each StorageNode
+> 64KB for each StorageNode
+> minimal space requirements
+
+Update when receiving a bitmap vector of the incoming superchunk.
+
+4. Choosing the best storage node
+calculating the overlap between the node's content and a superchunk.
+> update the corresponding bitmap of the chosen storage node.
+
+Also consider the data locality in backup workload:
+> rank the top $k$ StorageNode according to their overlaps, check if the node that stored the previous superchunk in the file is among these $k$.
+
+
+- Load Balancing
+1. Using bucket-based load-balancing strategy
+> splitting the storage space of each node into fixed-size buckets.
+
+The coordinator grants the StorageNode a new bucket only if doing so would not exceed the number of buckets allocated to the least loaded node by more than one (see the sketch below).
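+
+A small sketch of this rule (the `max_diff` parameter generalizes the default of one and corresponds to the "maximum allowed bucket difference" studied in the evaluation; names are illustrative):
+
+```python
+def may_grant_bucket(buckets, node, max_diff=1):
+    """buckets: {node_id: number of buckets already granted}."""
+    least_loaded = min(buckets.values())
+    return buckets[node] + 1 <= least_loaded + max_diff
+
+def grant_bucket(buckets, node, max_diff=1):
+    if may_grant_bucket(buckets, node, max_diff):
+        buckets[node] += 1
+        return True
+    return False          # caller must route the superchunk elsewhere
+```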
+
+
+### Implementation and Evaluation
+- Evaluation
+ - Dataset: *Wikipedia* snapshot (publicly available for download), private dataset.
+ - Baseline: BloomFilter (stateful), MinHash (stateless),
+
+1. BloomFilter (stateful)
+the Coordinator maintains a Bloom filter for each storage node as an index of the node's contents.
+> the decision of storing a new superchunk takes into account the location of existing ones.
+
+2. MinHash (stateless)
+select the minimum hash of the chunks in a superchunk as the superchunk's signature.
+
+- Metrics
+ - **Total Deduplication**: the ratio between the original size of the dataset and its size after deduplication (compared with the optimal case of a single-node system with no load-balancing constraints)
+ - **Data Skew**: the ratio of the occupied storage space on the most loaded node to the average load in the system
+ - **Assignment Time**: the time each strategy takes to assign the whole dataset to the available nodes.
+
+3. varying the node number of the cluster
+
+4. Sensitivity analysis
+> 1. superchunk size
+> 2. bucket size
+> 3. maximum allowed bucket difference
+
+
+## 2. Strength (Contributions of the paper)
+1. this paper proposes a lightweight probabilistic node-assignment mechanism
+> quickly identify the servers that can provide the highest deduplication rates for a given data chunks.
+
+2. a new bucket-based load-balancing strategy
+> spreads the load equally among the nodes.
+
+## 3. Weakness (Limitations of the paper)
+1. The rationale of its bucket-based load-balancing algorithm is not very clear. I do not understand why it outperforms its counterparts.
+
+2. Also, there are some system parameters that need to be configured and may impact the result, for example, the superchunk size. How to automatically configure those parameters for different workloads is still unclear.
+
+
+## 4. Some Insights (Future work)
+1. In this work, the two key issues are
+> 1) how to determine the storage node for each incoming superchunk to achieve better deduplication ratio?
+> 2) how to achieve the load-balancing under the above objective?
+
+For the first issue, the key is to design a data structure that tracks the content of each storage node.
+> probabilistic counting, set intersection
diff --git a/StoragePaperNote/Deduplication/Distributed-Dedup/TradeoffDataRouting-FAST'11.md b/StoragePaperNote/Deduplication/Distributed-Dedup/TradeoffDataRouting-FAST'11.md
old mode 100644
new mode 100755
index 63719af..b17efd9
--- a/StoragePaperNote/Deduplication/Distributed-Dedup/TradeoffDataRouting-FAST'11.md
+++ b/StoragePaperNote/Deduplication/Distributed-Dedup/TradeoffDataRouting-FAST'11.md
@@ -1,111 +1,111 @@
----
-typora-copy-images-to: ../paper_figure
----
-Tradeoffs in Scalable Data Routing for Deduplication Clusters
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'11 | Distributed Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-To meet increasing requirements, its goal is to design a backup storage system large enough to handle **multiple** primary storage systems.
-> build a deduplication cluster storage system with individual high-throughput nodes.
-> a cluster-wide data reduction ratio close to that of a single very large deduplication system.
-
-- Problem
-1. there is a tension between deduplication effectiveness and throughput.
-> using superchunk.
-
-2. the risk of creating duplicates since the fingerprint index is maintained independently on each node.
-> routing algorithm
-
-3. the system can overload a single node by routing too much data to it.
-> load balancing
-
-
-### Method
-- System Architecture
-
-group chunks into a superchunk , and routes each superchunk to a deduplicating storage node.
-> for high throughput.
-
-- Load balance (rebalancing)
-use a level of indirection called a bin, assign a superchunk to a bin using the mod function.
-> map each bin to a given node.
-
-Bin migration occurs when the storage usage of a node exceeds the average usage in the cluster by some threshold. (defaulting to 5%)
-
-
-- Data Routing
- - stateless routing: light-weight and well suited for most balanced workloads.
- - stateful routing: requiring more overhead but maintain a higher deduplication rate with larger clusters.
-
-1. Stateless routing
-produce a feature value representing the data and then apply a simple function to the value to make the assignment.
-
-how to represent a superchunk?
-use the first, maximum, minimum, or most common fingerprint.
-> suppose the hash values are often uniformly distributed.
-
-**Advantages**
-> 1) reduce overhead for recording node assignments
-> 2) reduce requirements for recovering this state after a system failure.
-
-**Disadvantages**
-> 1) potential loss of deduplication
-> 2) potential for increased data skew if the selected features are not uniformly distributed
-
-2. Stateful routing
-Using information about the location of existing chunks can improve deduplication.
-> use Bloom filter to count the routing algorithm of times each fingerprint in super-chunk is already stored on a given node.
-> vote-based approach, if the highest weighted vote is above a threshold, select that node.
->
-> > voting benefit threshold
-
-To mitigate the overhead of Bloom filter lookup
-> N chunks, M nodes, $\rightarrow$ MN lookups
-
-Sample some chunks for lookup
-> $\frac{MN}{2^B}$, reduce the Bloom filter lookup.
-
-
-
-**Advantages**
-> 1) provide the opportunity to incorporate expected deduplication and capacity balancing while assigning chunks to nodes.
-
-**Disadvantages**
-> 1) increased cost in computation and memory or communication overhead
-
-
-
-### Implementation and Evaluation
-- Evaluation
- - Dataset: 9 datasets
- - Superchunk size: 1MB
- - trace-driven simulation
-
-- Metrics
-1. Total Deduplication
-2. Data skew
-3. Effective Deduplication (ED)
-4. Normalized ED
-5. Fingerprint index lookups
-
-- Feature selection
- - Using 4 features: hash(64) of the first chunk, hash(\*) of the first chunk, the minimum of all hash(64), the minimum of all hash(\*)
-
-
-## 2. Strength (Contributions of the paper)
-1. provide a deep study of the properties of both stateless and stateful versions of super-chunk routing.
-2. extensive experiments to show that sensitive analysis.
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. Choosing the right chunk granularity presents a tradeoff between deduplication and system capacity and throughput even in a single-node system.
-2. 1MB superchunk size is good tradeoff between index lookups.
+---
+typora-copy-images-to: ../paper_figure
+---
+Tradeoffs in Scalable Data Routing for Deduplication Clusters
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'11 | Distributed Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+To meet increasing requirements, its goal is to design a backup storage system large enough to handle **multiple** primary storage systems.
+> build a deduplication cluster storage system with individual high-throughput nodes.
+> a cluster-wide data reduction ratio close to that of a single very large deduplication system.
+
+- Problem
+1. there is a tension between deduplication effectiveness and throughput.
+> using superchunk.
+
+2. the risk of creating duplicates since the fingerprint index is maintained independently on each node.
+> routing algorithm
+
+3. the system can overload a single node by routing too much data to it.
+> load balancing
+
+
+### Method
+- System Architecture
+
+Group chunks into superchunks, and route each superchunk to a deduplicating storage node.
+> for high throughput.
+
+- Load balance (rebalancing)
+use a level of indirection called a bin, assign a superchunk to a bin using the mod function.
+> map each bin to a given node.
+
+Bin migration occurs when the storage usage of a node exceeds the average usage in the cluster by some threshold. (defaulting to 5%)
+
+
+- Data Routing
+ - stateless routing: light-weight and well suited for most balanced workloads.
+ - stateful routing: requires more overhead but maintains a higher deduplication rate with larger clusters.
+
+1. Stateless routing
+produce a feature value representing the data and then apply a simple function to the value to make the assignment.
+
+how to represent a superchunk?
+use the first, maximum, minimum, or most common fingerprint.
+> assuming the hash values are uniformly distributed (a routing sketch follows the pros and cons below).
+
+**Advantages**
+> 1) reduce overhead for recording node assignments
+> 2) reduce requirements for recovering this state after a system failure.
+
+**Disadvantages**
+> 1) potential loss of deduplication
+> 2) potential for increased data skew if the selected features are not uniformly distributed
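+
+A minimal sketch of stateless routing with bins (the min-fingerprint feature is one of the choices listed above; names are illustrative):
+
+```python
+def stateless_route(superchunk_fps, bin_map):
+    """superchunk_fps: chunk fingerprints as ints; bin_map: list mapping bin -> node id."""
+    feature = min(superchunk_fps)        # or first / maximum / most common fingerprint
+    bin_id = feature % len(bin_map)      # bins add a level of indirection for rebalancing
+    return bin_map[bin_id]
+```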
+
+2. Stateful routing
+Using information about the location of existing chunks can improve deduplication.
+> use a Bloom filter per node to count the number of times each fingerprint in a superchunk is already stored on that node.
+> vote-based approach, if the highest weighted vote is above a threshold, select that node.
+>
+> > voting benefit threshold
+
+To mitigate the overhead of Bloom filter lookup
+> N chunks, M nodes, $\rightarrow$ MN lookups
+
+Sample some chunks for lookup
+> $\frac{MN}{2^B}$ lookups, reducing the Bloom filter overhead (see the sketch below).
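+
+A hedged sketch of the vote-based stateful routing with sampling (the `bloom_contains` callback, weighting, and threshold handling are assumptions, not the paper's exact scheme):
+
+```python
+def stateful_route(superchunk_fps, node_blooms, bloom_contains,
+                   sample_bits=3, weight=None, benefit_threshold=1):
+    """Sample 1-in-2^sample_bits fingerprints, count hits in each node's Bloom filter, vote."""
+    sampled = [fp for fp in superchunk_fps if fp % (1 << sample_bits) == 0]
+    votes = {}
+    for node, bloom in node_blooms.items():
+        hits = sum(1 for fp in sampled if bloom_contains(bloom, fp))
+        votes[node] = hits * (weight[node] if weight else 1)
+    best = max(votes, key=votes.get)
+    if votes[best] >= benefit_threshold:
+        return best                      # enough expected deduplication on that node
+    return None                          # caller falls back to a capacity-based choice
+```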
+
+
+
+**Advantages**
+> 1) provide the opportunity to incorporate expected deduplication and capacity balancing while assigning chunks to nodes.
+
+**Disadvantages**
+> 1) increased cost in computation and memory or communication overhead
+
+
+
+### Implementation and Evaluation
+- Evaluation
+ - Dataset: 9 datasets
+ - Superchunk size: 1MB
+ - trace-driven simulation
+
+- Metrics
+1. Total Deduplication
+2. Data skew
+3. Effective Deduplication (ED)
+4. Normalized ED
+5. Fingerprint index lookups
+
+- Feature selection
+ - Using 4 features: hash(64) of the first chunk, hash(\*) of the first chunk, the minimum of all hash(64), the minimum of all hash(\*)
+
+
+## 2. Strength (Contributions of the paper)
+1. provide a deep study of the properties of both stateless and stateful versions of super-chunk routing.
+2. extensive experiments and sensitivity analysis.
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. Choosing the right chunk granularity presents a tradeoff between deduplication and system capacity and throughput even in a single-node system.
+2. A 1MB superchunk size is a good tradeoff, since index lookups
> impact deduplication throughput, and effective cluster-wide deduplication.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/GC/AcceleratingRestore-ATC'14.md b/StoragePaperNote/Deduplication/GC/AcceleratingRestore-ATC'14.md
old mode 100644
new mode 100755
index 74f2629..49f01e5
--- a/StoragePaperNote/Deduplication/GC/AcceleratingRestore-ATC'14.md
+++ b/StoragePaperNote/Deduplication/GC/AcceleratingRestore-ATC'14.md
@@ -1,101 +1,101 @@
----
-typora-copy-images-to: ../paper_figure
----
-Accelerating Restore and Garbage Collection in Deduplication-based Backup Systems via Exploiting Historical Information
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ATC'14 | Deduplication Restore and GC |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-
-- Motivation
-1. The **identification of valid chunks** is *cumbersome* and the **merging operation** is the most time-consuming phase in garbage collection.
-> identify valid chunks and the containers holding only a few valid chunks.
-
-Merging operation: copy the valid chunks in the identified containers to new containers, then the identified containers are reclaimed.
-> most time-consuming phase in garbage collection.
-
-2. Most of current approaches rewrite chunks belong to out-of-order containers, which limit their gains in restore performance and garbage collection efficiency.
-
-- Observation
- - reducing sparse containers is important to address the fragmentation problem.
- - two consecutive backups are very similar, and thus historical information collected during the backup is very useful to improve the next backup.
-
-### Method name
-- Fragmentation Classification
-fragmentation come in two categories:
-> sparse containers
-> out-of-order containers
-
-1. Sparse containers
-it defines **container utilization** for a backup as the fraction of its chunks referenced by the backup.
-> define a threshold, if the utilization is smaller than the predefined threshold, regard it as a sparse container.
-> reduce sparse containers can improve the restore performance.
-
-2. Out-of-order container
-A container is accessed many times **intermittently** during a restore, it considers it as an out-of-order container for the restore.
-> cause by the **self-referred** chunk.
-> reduce the restore performance if the cache size is smaller that the cache threshold
-> have no negative impact on garbage collection
-
-
-- Key question
-How to reduce sparse containers becomes the key problem.
-> Observation: sparse containers of the backup remain sparse in the next backup.
-
-
-- System architecture
-
-Use a data structure to track all containers:
-> (container's ID, current utilization, pointer)
-
-- History-aware rewriting algorithm
-**Idea**: rewrites all duplicate chunks in the inherited sparse containers.
-The effect depends on how many backups it needs to retain?
-
-
-
-- Optimal restore cache
-Use Belady's optimal replacement cache
-> need to know the future access pattern
-
-Collect access pattern information during the backup
-> since the sequence of reading chunks during the restore is just the same as the sequence of writing them during a backup.
-> record the container ID
-
-- Container-Marker Algorithm
-HAR naturally accelerates expirations of sparse containers and thus the merging is no longer necessary.
-> For each container, CMA maintains a dataset list that records IDs of the datasets referring to the container.
-> If a container's list is empty, the container can be reclaimed.
-
-### Implementation and Evaluation
-- Dataset:
- - VMDK, Linux, synthetic dataset
-
-- Metric
-1. average utilization
-2. deduplication ratio: outperform CBR and CAP in terms of backup performance.
-3. restore performance: speed factor
-4. garbage collection: reference management overhead, valid containers
-5. varying the utilization threshold: deduplication ratio, restore performance
-
-## 2. Strength (Contributions of the paper)
-
-1. this paper classifies the fragmentation into two categories: out-of-order and sparse containers.
-> out-of-order containers reduce restore performance.
-> sparse containers: reduce both restore performance and garbage collection efficiency.
-
-2. it proposes History-Aware Rewriting algorithm (HAR) to accurately identify and reduce sparse containers.
-
-3. it also proposes Container-Marker algorithm (CMA) to reduce the metadata overhead of garbage collection.
-> identifies valid containers instead of valid chunks in garbage collection.
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights
-1. The key of HAR is to maintain high utilizations of containers.
+---
+typora-copy-images-to: ../paper_figure
+---
+Accelerating Restore and Garbage Collection in Deduplication-based Backup Systems via Exploiting Historical Information
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ATC'14 | Deduplication Restore and GC |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+
+- Motivation
+1. The **identification of valid chunks** is *cumbersome* and the **merging operation** is the most time-consuming phase in garbage collection.
+> identify valid chunks and the containers holding only a few valid chunks.
+
+Merging operation: copy the valid chunks in the identified containers to new containers, then the identified containers are reclaimed.
+> most time-consuming phase in garbage collection.
+
+2. Most current approaches rewrite chunks belonging to out-of-order containers, which limits their gains in restore performance and garbage collection efficiency.
+
+- Observation
+ - reducing sparse containers is important to address the fragmentation problem.
+ - two consecutive backups are very similar, and thus historical information collected during the backup is very useful to improve the next backup.
+
+### HAR and CMA
+- Fragmentation Classification
+Fragmentation comes in two categories:
+> sparse containers
+> out-of-order containers
+
+1. Sparse containers
+it defines **container utilization** for a backup as the fraction of its chunks referenced by the backup.
+> define a threshold, if the utilization is smaller than the predefined threshold, regard it as a sparse container.
+> reducing sparse containers can improve the restore performance (see the sketch below).
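+
+A small sketch of the utilization check (the threshold value and input shapes are illustrative):
+
+```python
+def sparse_containers(container_chunks, referenced, threshold=0.5):
+    """container_chunks: {container_id: set(fp)}; referenced: fingerprints used by this backup."""
+    sparse = []
+    for cid, chunks in container_chunks.items():
+        utilization = len(chunks & referenced) / len(chunks)   # fraction referenced by the backup
+        if utilization < threshold:
+            sparse.append((cid, utilization))
+    return sparse
+```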
+
+2. Out-of-order container
+If a container is accessed many times **intermittently** during a restore, it is considered an out-of-order container for that restore.
+> caused by **self-referred** chunks.
+> reduces the restore performance if the cache size is smaller than the cache threshold
+> have no negative impact on garbage collection
+
+
+- Key question
+How to reduce sparse containers becomes the key problem.
+> Observation: sparse containers of the backup remain sparse in the next backup.
+
+
+- System architecture
+
+Use a data structure to track all containers:
+> (container's ID, current utilization, pointer)
+
+- History-aware rewriting algorithm
+**Idea**: rewrites all duplicate chunks in the inherited sparse containers.
+The effect depends on how many backups need to be retained.
+
+
+
+- Optimal restore cache
+Use Belady's optimal replacement cache
+> need to know the future access pattern
+
+Collect access pattern information during the backup
+> since the sequence of reading chunks during the restore is just the same as the sequence of writing them during a backup.
+> record the container IDs (see the sketch below)
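+
+A minimal sketch of a Belady-style restore cache over container IDs, assuming the access sequence recorded at backup time is available:
+
+```python
+def belady_misses(accesses, cache_size):
+    """accesses: list of container IDs in restore order; returns the number of container reads."""
+    cache, misses = set(), 0
+    for i, cid in enumerate(accesses):
+        if cid in cache:
+            continue
+        misses += 1                           # container must be read from disk
+        if len(cache) >= cache_size:
+            # evict the cached container whose next use is farthest away (or never)
+            future = accesses[i + 1:]
+            victim = max(cache,
+                         key=lambda c: future.index(c) if c in future else len(future) + 1)
+            cache.remove(victim)
+        cache.add(cid)
+    return misses
+```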
+
+- Container-Marker Algorithm
+HAR naturally accelerates expirations of sparse containers and thus the merging is no longer necessary.
+> For each container, CMA maintains a dataset list that records IDs of the datasets referring to the container.
+> If a container's list is empty, the container can be reclaimed.
+
+### Implementation and Evaluation
+- Dataset:
+ - VMDK, Linux, synthetic dataset
+
+- Metric
+1. average utilization
+2. deduplication ratio: outperform CBR and CAP in terms of backup performance.
+3. restore performance: speed factor
+4. garbage collection: reference management overhead, valid containers
+5. varying the utilization threshold: deduplication ratio, restore performance
+
+## 2. Strength (Contributions of the paper)
+
+1. this paper classifies the fragmentation into two categories: out-of-order and sparse containers.
+> out-of-order containers reduce restore performance.
+> sparse containers: reduce both restore performance and garbage collection efficiency.
+
+2. it proposes History-Aware Rewriting algorithm (HAR) to accurately identify and reduce sparse containers.
+
+3. it also proposes Container-Marker algorithm (CMA) to reduce the metadata overhead of garbage collection.
+> identifies valid containers instead of valid chunks in garbage collection.
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights
+1. The key of HAR is to maintain high utilizations of containers.
2. In GC, the reference management is the key requirement, and HAR manages the reference in container level instead of chunk level.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/GC/GC-FAST'17.md b/StoragePaperNote/Deduplication/GC/GC-FAST'17.md
old mode 100644
new mode 100755
index f8ba0a0..d53be01
--- a/StoragePaperNote/Deduplication/GC/GC-FAST'17.md
+++ b/StoragePaperNote/Deduplication/GC/GC-FAST'17.md
@@ -1,102 +1,102 @@
----
-typora-copy-images-to: ../paper_figure
----
-The Logic of Physical Garbage Collection in Deduplicating Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'17 | Deduplication GC |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- In a deduplicating storage system, GC is complicated by possibility of numerous references to the same underlying data.
-
-- *Logic level GC*: the system analyzes each live file to determine the set of live chunks in the storage system.
-> this cannot handle the shift to using individual file-level backups, rather than tar-like aggregates (the number of files increased dramatically) **high overhead in mark phase**
-
-This motivates this paper to propose *physical level GC*, which performs a series of sequential passes through the **storage containers**.
-> I/O pattern is sequential
-> it is scales with the physical capacity rather than the deduplication
-
-### Scalable GC
-- Assumption:
-While some deduplication research assumed a FIFO deletion pattern, file deletion can generally be in **any order**.
-
-- Deuplicated file representation
-using Merkle trees for deduplicated storage to represent a file.
-
-- Performance issues with enumeration
-1. Deduplication and compression:
-some datasets have extremely high $TC$, making the logical space vary large and unreasonably slow to enumerate.
-
-2. Number of files:
-Every file has metadata overhead to be processed when enumerating a file, and for small files, the overhead may be as large as the enumeration time itself.
-
-3. Spatial locality:
-While sequentially written files tend to have high locality of $L_p$ chunks, creating a new file from incremental changes harms locality as enumeration has to jump among branches of the $L_p$ tree.
-
-- Original Logical GC
-1. **Enumeration**: To identify the live chunks, it enumerates all of the files referenced from the root ($L_1$ to $L_6$ chunks). **Depth-first** walk and mark live chunks.
-> record the fingerprints for live files in the **Live Bloom Filter**.
-> track the most recent instance of each chunk in the presence of duplicates in the **Live Instance Bloom Filter**.
-
-2. **Filter**: Check each container via iterating through the container metadata region, clean on those where the most space can be reclaimed with the least effort.
-> liveness of the container: the number of live fingerprints divided by the total number of fingerprints in the containers.
-
-3. **Select**: select best containers to compact.
-
-4. **Copy**: new containers are formed from live chunks copied out of previous containers.
-
-- Physical GC
-Uses **breadth-first** walk instead of per-file depth-first walk during enumeration.
-
-A new method of enumeration identifies live chunks from **containers rather than by iterating through each file**.
-> via using perfect hashing, use less memory
-> using PHV to store $L_p$ for assisting breadth-first walk.
-
-1. **Analysis**: create the PH function by analyzing the fingerprints in the on-disk index.
-> unique mapping from fingerprint to offset in a PHV. (4 bits per fingerprint)
-
-2. **Enumeration**: Unlike LGC, instead of walking the file tree structure, it performs a series of sequential container scans.
-
-3. **Filter, select, copy**
-
-- Physical GC+
-Replace Bloom filter with Perfect Hash vector for tracking live and dead chunks.
-> in analysis phase build two perfect hash vectors.
-
-
-### Implementation and Evaluation
-- Evaluation
-Comparison of GC runs for systems upgraded from LGC to PGC
-> 1. Standard workloads
-> 2. Problematic workloads
-
-1. GC on different platforms
-> GC duration
-
-For high TC workloads, PGC improved from LGC up to 20x
-For high file count workload, PGC improved over LGC by 7x
-
-## 2. Strength (Contributions of the paper)
-1. It mentions two approaches to garbage collection in a deduplicating storage system.
-> handle different workload.
-> make enumeration time
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-1. This paper mentions two classical deletion algorithms, reference counts and mark-and-sweep.
-> 1. reference count: maintain a count for each chunk. (inline approach)
-> 2. mark-and-sweep: walk all live files and marks them in a data structure. Then, scans the containers and sweeps unreferenced chunks. (asynchronous algorithm). live chunks are read from disk and written to new containers.
-
-2. This paper mentions two trends in deduplication system.
-> 1. the increase in the file count. (treating it more like primary storage than backup)
-> 2. deduplication rate goes up (more frequent point-in-time backups)
-
-
-3. For the benefits and costs of physical enumeration
-Pro: it can perform sequential scan of containers on disk
-Con: extra analysis cost doesn't help traditional workloads
-
+---
+typora-copy-images-to: ../paper_figure
+---
+The Logic of Physical Garbage Collection in Deduplicating Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'17 | Deduplication GC |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- In a deduplicating storage system, GC is complicated by the possibility of numerous references to the same underlying data.
+
+- *Logical-level GC*: the system analyzes each live file to determine the set of live chunks in the storage system.
+> this cannot handle the shift to individual file-level backups, rather than tar-like aggregates (the number of files increases dramatically), leading to **high overhead in the mark phase**
+
+This motivates this paper to propose *physical level GC*, which performs a series of sequential passes through the **storage containers**.
+> I/O pattern is sequential
+> it scales with the physical capacity rather than the deduplication ratio
+
+### Scalable GC
+- Assumption:
+While some deduplication research assumed a FIFO deletion pattern, file deletion can generally be in **any order**.
+
+- Deduplicated file representation
+using Merkle trees for deduplicated storage to represent a file.
+
+- Performance issues with enumeration
+1. Deduplication and compression:
+some datasets have extremely high $TC$ (total compression), making the logical space very large and unreasonably slow to enumerate.
+
+2. Number of files:
+Every file has metadata overhead to be processed when enumerating a file, and for small files, the overhead may be as large as the enumeration time itself.
+
+3. Spatial locality:
+While sequentially written files tend to have high locality of $L_p$ chunks, creating a new file from incremental changes harms locality as enumeration has to jump among branches of the $L_p$ tree.
+
+- Original Logical GC
+1. **Enumeration**: To identify the live chunks, it enumerates all of the files referenced from the root ($L_1$ to $L_6$ chunks). **Depth-first** walk and mark live chunks.
+> record the fingerprints for live files in the **Live Bloom Filter**.
+> track the most recent instance of each chunk in the presence of duplicates in the **Live Instance Bloom Filter**.
+
+2. **Filter**: Check each container by iterating through the container metadata region, and clean those where the most space can be reclaimed with the least effort (see the sketch after this list).
+> liveness of a container: the number of live fingerprints divided by the total number of fingerprints in the container.
+
+3. **Select**: select best containers to compact.
+
+4. **Copy**: new containers are formed from live chunks copied out of previous containers.
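+
+A small sketch of the liveness computation and container selection (the `budget` cutoff and input shapes are illustrative):
+
+```python
+def select_containers(container_fps, live_fps, budget):
+    """container_fps: {container_id: [fingerprints]}; live_fps: set of live fingerprints."""
+    liveness = {cid: sum(fp in live_fps for fp in fps) / len(fps)
+                for cid, fps in container_fps.items()}
+    candidates = sorted(liveness, key=liveness.get)   # least-live containers reclaim the most space
+    return candidates[:budget], liveness
+```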
+
+- Physical GC
+Uses **breadth-first** walk instead of per-file depth-first walk during enumeration.
+
+A new method of enumeration identifies live chunks from **containers rather than by iterating through each file**.
+> via using perfect hashing, use less memory
+> using PHV to store $L_p$ for assisting breadth-first walk.
+
+1. **Analysis**: create the PH function by analyzing the fingerprints in the on-disk index.
+> unique mapping from fingerprint to offset in a PHV. (4 bits per fingerprint)
+
+2. **Enumeration**: Unlike LGC, instead of walking the file tree structure, it performs a series of sequential container scans.
+
+3. **Filter, select, copy**
+
+- Physical GC+
+Replace Bloom filter with Perfect Hash vector for tracking live and dead chunks.
+> in analysis phase build two perfect hash vectors.
+
+
+### Implementation and Evaluation
+- Evaluation
+Comparison of GC runs for systems upgraded from LGC to PGC
+> 1. Standard workloads
+> 2. Problematic workloads
+
+1. GC on different platforms
+> GC duration
+
+For high-TC workloads, PGC improved over LGC by up to 20x.
+For high-file-count workloads, PGC improved over LGC by 7x.
+
+## 2. Strength (Contributions of the paper)
+1. It presents two approaches to garbage collection in a deduplicating storage system.
+> they handle different workloads.
+> they differ in how enumeration time scales.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. This paper mentions two classical deletion algorithms, reference counts and mark-and-sweep.
+> 1. reference count: maintain a count for each chunk. (inline approach)
+> 2. mark-and-sweep: walks all live files and marks them in a data structure; then scans the containers and sweeps unreferenced chunks (asynchronous algorithm). Live chunks are read from disk and written to new containers.
+
+2. This paper mentions two trends in deduplication system.
+> 1. the increase in the file count. (treating it more like primary storage than backup)
+> 2. deduplication rate goes up (more frequent point-in-time backups)
+
+
+3. For the benefits and costs of physical enumeration
+Pro: it can perform sequential scan of containers on disk
+Con: extra analysis cost doesn't help traditional workloads
+
diff --git a/StoragePaperNote/Deduplication/GC/MemorySanitization-FAST'13.md b/StoragePaperNote/Deduplication/GC/MemorySanitization-FAST'13.md
old mode 100644
new mode 100755
index fd5f76f..f78257a
--- a/StoragePaperNote/Deduplication/GC/MemorySanitization-FAST'13.md
+++ b/StoragePaperNote/Deduplication/GC/MemorySanitization-FAST'13.md
@@ -1,73 +1,73 @@
----
-typora-copy-images-to: ../paper_figure
----
-Memory Efficient Sanitization of a Deduplicated Storage System
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'13 | Deduplication Sanitization |
-[TOC]
-
-## 1. Summary
-
-### Motivation of this paper
-- Motivation
-Securely erasing sensitive data from a storage system (sanitization could require erasing all unreferenced blocks)
-> each piece of data on the physical media could be referred to by multiple namespace objects.
-> standard techniques for tracking data references will not fit in memory
-
-- Key requirement for sanitization
-> 1. all deleted data are erased
-> 2. all live data are available
-> 3. the whole sanitization process is efficient
-> 4. the storage system is usable while sanitization runs
-
-### Sanitization
-- Threat model
-1. casual attack
-> access through regular file system interfaces.
-
-2. Robust keyboard attack
-> reading blocks directly from disk, swap area, or unallocated blocks
-> using non-regular interfaces.
-
-3. Laboratory attack
-> access through exotic laboratory techniques
-> require specific disk format
-
-- Managing chunk references
-1. Bloom filter
-2. Bit vector
-3. Perfect hash vector
-**key requirement:** need a compact representation of a set of fingerprints that provides an exact answer for whether a given fingerprint exists in the set or not.
-> suppose it is a static version of the membership problem where the key space is known beforehand.
-> no dynamic insertion or deletion of keys.
-
-those two points support to leverage perfect hash vector.
-
-- Sanitization process
-For read-only file system:
-> 1. Merge phase: set the consistency point, flush the in-memory fingerprint index buffer and merge it with the on-disk index.
-> 2. Traverse the on-disk index for all fingerprints and build the perfect hash function for all fingerprints found
-> 3. Traverse all files and mark all fingerprints found as live in perfect hash vector
-> 4. select containers with at least one dead chunk, and copy all live chunks from the selected containers into new containers (copy forward), and delete the selected containers.
-
-
-### Implementation and Evaluation
-- Evaluation:
- - without deduplication: exclude the deduplication impact on the sanitization process (as the baseline)
- - with deduplication: deleted space vs sanitization time
- - impact on ingests: the performance when both sanitization and data ingestion run concurrently.
-
-## 2. Strength (Contributions of the paper)
-1. discuss sanitization requirements in the context of deduplicated storage.
-2. it proposes some memory efficient schemes for managing data references via using perfect hashes.
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-1. In this paper, it mentions the **crypto sanitization**, which encrypts each file with a different key and throws away the key of the affected files. Is it feasible to adjust this scheme to deduplication system.
-2. Here, it also uses perfect hash to represent the membership, and show it is memory efficient. How to adjust this technique to our problem?
-
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Memory Efficient Sanitization of a Deduplicated Storage System
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'13 | Deduplication Sanitization |
+[TOC]
+
+## 1. Summary
+
+### Motivation of this paper
+- Motivation
+Securely erasing sensitive data from a storage system (sanitization could require erasing all unreferenced blocks)
+> each piece of data on the physical media could be referred to by multiple namespace objects.
+> standard techniques for tracking data references will not fit in memory
+
+- Key requirement for sanitization
+> 1. all deleted data are erased
+> 2. all live data are available
+> 3. the whole sanitization process is efficient
+> 4. the storage system is usable while sanitization runs
+
+### Sanitization
+- Threat model
+1. casual attack
+> access through regular file system interfaces.
+
+2. Robust keyboard attack
+> reading blocks directly from disk, swap area, or unallocated blocks
+> using non-regular interfaces.
+
+3. Laboratory attack
+> access through exotic laboratory techniques
+> require specific disk format
+
+- Managing chunk references
+1. Bloom filter
+2. Bit vector
+3. Perfect hash vector
+**key requirement:** need a compact representation of a set of fingerprints that provides an exact answer for whether a given fingerprint exists in the set or not.
+> it can be treated as a static version of the membership problem where the key space is known beforehand
+> no dynamic insertion or deletion of keys
+
+These two properties justify leveraging a perfect hash vector (see the sketch below).
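+
+A rough back-of-the-envelope comparison (my own arithmetic, not numbers from the paper): a Bloom filter needs $m = -n \ln p / (\ln 2)^2$ bits for $n$ keys at false-positive rate $p$, and any false positive can let a dead chunk survive sanitization, whereas a perfect hash vector gives an exact answer at a few bits per key because the fingerprint set is static:
+```python
+import math
+
+def bloom_bits_per_key(p):
+    """Classic Bloom filter sizing: bits per key for false-positive rate p."""
+    return -math.log(p) / (math.log(2) ** 2)
+
+n = 10_000_000_000                    # e.g. 10 billion fingerprints (made up)
+for p in (0.01, 0.001):
+    bits = bloom_bits_per_key(p)
+    print(f"Bloom filter, p={p}: {bits:.1f} bits/key, "
+          f"{n * bits / 8 / 2**30:.1f} GiB total (and not exact)")
+print(f"Perfect hash vector, ~4 bits/key: {n * 4 / 8 / 2**30:.1f} GiB total (exact)")
+```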
+
+- Sanitization process
+For read-only file system:
+> 1. Merge phase: set the consistency point, flush the in-memory fingerprint index buffer and merge it with the on-disk index.
+> 2. Traverse the on-disk index for all fingerprints and build the perfect hash function for all fingerprints found
+> 3. Traverse all files and mark all fingerprints found as live in perfect hash vector
+> 4. select containers with at least one dead chunk, and copy all live chunks from the selected containers into new containers (copy forward), and delete the selected containers.
+
+
+### Implementation and Evaluation
+- Evaluation:
+ - without deduplication: exclude the deduplication impact on the sanitization process (as the baseline)
+ - with deduplication: deleted space vs sanitization time
+ - impact on ingests: the performance when both sanitization and data ingestion run concurrently.
+
+## 2. Strength (Contributions of the paper)
+1. It discusses sanitization requirements in the context of deduplicated storage.
+2. It proposes memory-efficient schemes for managing data references using perfect hashes.
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. In this paper, it mentions **crypto sanitization**, which encrypts each file with a different key and throws away the keys of the affected files. Is it feasible to adapt this scheme to a deduplication system?
+2. Here, it also uses perfect hashing to represent membership and shows that it is memory efficient. How can this technique be adapted to our problem?
+
+
diff --git a/StoragePaperNote/Deduplication/Mem-Dedup/HintsDeduplication-FAST'16.md b/StoragePaperNote/Deduplication/Mem-Dedup/HintsDeduplication-FAST'16.md
old mode 100644
new mode 100755
index 7e523b1..539c60b
--- a/StoragePaperNote/Deduplication/Mem-Dedup/HintsDeduplication-FAST'16.md
+++ b/StoragePaperNote/Deduplication/Mem-Dedup/HintsDeduplication-FAST'16.md
@@ -1,79 +1,79 @@
----
-typora-copy-images-to: ../paper_figure
----
-Using Hints to Improve Inline Block-Layer Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'16 | Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Important information about data context (e.g. data vs. metadata writes) is lost at the block layer.
-> This paper argues passing such context to the block layer can help improve the deduplication performance and reliability.
-> The root cause: the **semantic divide** between the block layer and file systems
-
-
-
-This paper proposes to design the interface in block-layer deduplication system, which can allow upper storage layers to pass hints based on the available context.
-> Most of existing deduplication solutions are built into file systems because they have enough information to deduplicate efficiently without jeopardizing reliability. This information can be leveraged to avoid deduplicating certain blocks (e.g., metadata)
-
-### Block-layer deduplication hints
-- Hints
-Hinting asks higher layers to provide small amounts to extra information to the deduplication. In this paper, it uses hinting to recover context at the block layer.
-
-- Two main advantages in block-layer deduplication
-1. allowing nay file system and application to benefit from deduplication
-2. ease of implementation
-
-- Potential Hints
-1. Bypass deduplication (**NODEDUP**)
-Main idea: some writes are known a priori to be likely to be **unique**. Attempting to deduplicate unique writes wastes CPU time on hash computation and I/O bandwidth on maintaining the hash index.
-> application requirement: generate data should not be duplicated. (random data or encrypted data)
-> Overhead: hash computation, index size, more RAM space, more lookup bandwidth.
-> main issue: unique data and reliability
-
-
-For **metadata**:
-Most file system metadata is unique
-> metadata writes are more important to overall system performance than data writes becasue the former are oftern synchronous.
-> add deduplication to metadata might increase the latency of those critical metadata writes.
-> reliability: duplicates metadata to avoid corruption.
-
-2. Prefetch hashes (**PREFETCH**)
-When a block layer deduplication knows what data is about to be written, it can prefetch the corresponding hashes from the index
-> accelerating future data writes by reducing lookup delays.
-> inform the deduplication system of I/O operations that are likely to generate further duplicates (copy file)
-> their hashes can be prefetched and cached to minimize random accesses.
-
-3. Other
-Bypass compression, cluster hashes, partitioned hash index, intelligent chunking
-> cluster hashes: files that reside in the same directory tend to be accessed together.
-
-
-### Implementation and Evaluation
-- Implementation
-Modify the write path and read path.
-
-
-
-- Evaluation
-1. NODEDUP: observe the elapsed time in four file systems
-> no-hint v.s. hint-on
-
-2. PREFETHCH: observe the elapsed time in four file systems
-> no-hint v.s. hint-on
-
-3. Throughput: using Filebench
-
-## 2. Strength (Contributions of the paper)
-1. This paper states that if a block-level deduplication system can know when it is unwise to deduplicate a write, it can optimize its performance and reliability.
-2. This method can be useful when writing unique data (i.e., avoid wastage of resources) or need to store duplicate chunks for reliability.
-
-## 3. Weakness (Limitations of the paper)
-1. In my opinion, the idea in this paper is just to deliever the context information to the block-layer to let block-layer deduplication do better.
-
-## 4. Future Works
-1. This work mentions that its initial experiments result is successful, and it can add more hints to provide more information in block-layer
+---
+typora-copy-images-to: ../paper_figure
+---
+Using Hints to Improve Inline Block-Layer Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'16 | Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Important information about data context (e.g. data vs. metadata writes) is lost at the block layer.
+> This paper argues passing such context to the block layer can help improve the deduplication performance and reliability.
+> The root cause: the **semantic divide** between the block layer and file systems
+
+
+
+This paper proposes an interface for block-layer deduplication systems that allows upper storage layers to pass hints based on the available context.
+> Most of existing deduplication solutions are built into file systems because they have enough information to deduplicate efficiently without jeopardizing reliability. This information can be leveraged to avoid deduplicating certain blocks (e.g., metadata)
+
+### Block-layer deduplication hints
+- Hints
+Hinting asks higher layers to provide small amounts of extra information to the deduplication layer. In this paper, hinting is used to recover context at the block layer.
+
+- Two main advantages in block-layer deduplication
+1. allowing any file system and application to benefit from deduplication
+2. ease of implementation
+
+- Potential Hints
+1. Bypass deduplication (**NODEDUP**)
+Main idea: some writes are known a priori to be likely to be **unique**. Attempting to deduplicate unique writes wastes CPU time on hash computation and I/O bandwidth on maintaining the hash index.
+> application knowledge: the generated data is known to be unlikely to have duplicates (random data or encrypted data)
+> Overhead: hash computation, index size, more RAM space, more lookup bandwidth.
+> main issue: unique data and reliability
+
+
+For **metadata**:
+Most file system metadata is unique
+> metadata writes are more important to overall system performance than data writes because the former are often synchronous.
+> adding deduplication to metadata might increase the latency of those critical metadata writes.
+> reliability: file systems deliberately duplicate metadata to avoid corruption.
+
+2. Prefetch hashes (**PREFETCH**)
+When a block-layer deduplication system knows what data is about to be written, it can prefetch the corresponding hashes from the index (a toy hint-aware write path is sketched after this list)
+> accelerating future data writes by reducing lookup delays.
+> inform the deduplication system of I/O operations that are likely to generate further duplicates (copy file)
+> their hashes can be prefetched and cached to minimize random accesses.
+
+3. Other
+Bypass compression, cluster hashes, partitioned hash index, intelligent chunking
+> cluster hashes: files that reside in the same directory tend to be accessed together.
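+
+A toy hint-aware write path (my own sketch, not the paper's code; the hint names follow the paper, everything else is illustrative):
+```python
+import hashlib
+
+index = {}          # fingerprint -> physical block number (the dedup index)
+hash_cache = {}     # prefetched fingerprint -> physical block number
+storage = {}        # physical block number -> data
+next_pbn = 0
+
+def write_block(data, hints=()):
+    """NODEDUP skips hashing and indexing entirely; otherwise look the
+    fingerprint up in the prefetch cache first, then in the full index."""
+    global next_pbn
+    if "NODEDUP" in hints:              # unique or reliability-critical writes
+        pbn, next_pbn = next_pbn, next_pbn + 1
+        storage[pbn] = data
+        return pbn
+    fp = hashlib.sha1(data).digest()
+    pbn = hash_cache.get(fp)
+    if pbn is None:
+        pbn = index.get(fp)
+    if pbn is not None:
+        return pbn                      # duplicate: no new block written
+    pbn, next_pbn = next_pbn, next_pbn + 1
+    storage[pbn] = data
+    index[fp] = pbn
+    return pbn
+
+def prefetch(fingerprints):
+    """PREFETCH: warm the cache with hashes likely to be rewritten soon (e.g. a file copy)."""
+    for fp in fingerprints:
+        if fp in index:
+            hash_cache[fp] = index[fp]
+```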
+
+
+### Implementation and Evaluation
+- Implementation
+Modify the write path and read path.
+
+
+
+- Evaluation
+1. NODEDUP: observe the elapsed time in four file systems
+> no-hint vs. hint-on
+
+2. PREFETCH: observe the elapsed time in four file systems
+> no-hint vs. hint-on
+
+3. Throughput: using Filebench
+
+## 2. Strength (Contributions of the paper)
+1. This paper states that if a block-level deduplication system can know when it is unwise to deduplicate a write, it can optimize its performance and reliability.
+2. This method can be useful when writing unique data (i.e., to avoid wasting resources) or when duplicate chunks need to be stored for reliability.
+
+## 3. Weakness (Limitations of the paper)
+1. In my opinion, the idea in this paper is simply to deliver context information to the block layer so that block-layer deduplication can do a better job.
+
+## 4. Future Works
+1. This work mentions that its initial experimental results are promising, and that more hints can be added to provide more information to the block layer
> provide richer context to the block layer.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Mem-Dedup/UKSM-FAST'18.md b/StoragePaperNote/Deduplication/Mem-Dedup/UKSM-FAST'18.md
old mode 100644
new mode 100755
index 300fc7b..6e86afd
--- a/StoragePaperNote/Deduplication/Mem-Dedup/UKSM-FAST'18.md
+++ b/StoragePaperNote/Deduplication/Mem-Dedup/UKSM-FAST'18.md
@@ -1,77 +1,77 @@
----
-typora-copy-images-to: paper_figure
----
-UKSM: Swift Memory Deduplication via Hierarchical and Adaptive Memory Region Distilling
-------------------------------------------
-| Venue | Category |
-| :---: | :------: |
-| FAST'18 | Memory Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-The responsiveness of the memory deduplication process is very important to newly generated pages. If the deduplication approach cannot catch up with the generation speed of memory redundancy, memory pages would be swapped out to the disk, and the whole system is slowed down.
-> State-of-art Content Based Page Sharing (CBPS) approaches **lack reponsiveness** as they equally scan every page to find redundancies.
-
-- Two key observations
-1. Most pages within the same regio present similar duplication pattern.
-2. Partial page hashing need to be adaptive.
-
-### UKSM
-- Main idea:
-> 1. do not treat each page equally, prioritizes different memory regions (with different CPU cycles, different sample intervals) to accelerate the deduplication process.
-> 2. To identify whether two pages are same, partial page hashing needs to be adaptive to the dynamic workload (to achieve more cost-effective scanning).
-
-#### Technology 1: Hierarchical Region Distilling
-
-The higher the level is, the higher the page scanning frequency is.
-
-- Key characteristics for duplication qualities for each memory region
-> duplication ratio, average page lifetime, copy-on-write ratio
-> those parameters can be collected from each round of sample
-
-- UKSM can set threshold values for each of those parameters to decide whether this memory region can be promoted or demoted to other levels.
-> those thresholds are configurable, and can be set by the users depending on the targeted workload.
-
-- Hierarchical sampling procedure
-**Scan a level**: For each region in different levels, it scans and samples the page according to the length of interval.
-> a higher level possesses a smaller interval
-
-once a page is picked, it calculates the hash value of it according to current hash strength (bytes hashed in each page).
-> compare the hash value in two red-black trees and updates the counter of that region. (oen tree for management of merged page hashes, and one for management of unmerged page hashes)
-
-After each **global sampling round**, the scanner estimates each region's duplication and COW-broken ratios.
-
-#### Technology 2: Adaptive Partial Hashing
-Goal: reduce per-page scan and deduplication cost (by partially hash a page).
-> **Rational**: the hash value is already sufficient to distinguish different pages, it does not need to hash a full page.
-
-For its algorithm, it plans to finds the optimal strength for hte hash function (Global hashing strength)
-> profit: a weak hash strength can save more time
-> penalty: a weak hash can increase the possibility of false positive, results in additional overhead of **memcmp**.
-
-### Implementation and Evaluation
-- Implement in Linux and Xen
-> Around 6,000 lines of C code.
-> Linux: UKSM hooks the linux kernel memory management subsystem for monitoring the creation and termination of application memory regions.
-
-- Evaluation
-1. Memory Saving, CPU Consumption, Deduplication Speed
-Emulated workloads and real-word workloads, compare with the **SuperFast Hash** of the natvie Linux KSM scanner.
-> Emulated workload: emulate three kinds of workload: statically mixed workload, COW-broken workload, short-lived workload.
-> Real-word workload: compare with XLH in ATC'13, Docker containers, desktop server.
-
-2. Analysis of Adaptive Partial Hashing
-> Scanning speed and deduplication speed on different degree redundant regions.
-> the strength of hash function for different benchmark workloads
-
-## 2. Strength (Contributions of the paper)
-1. This paper does a lots of experiments to prove different workloads can lead to different trade-off between memory saving and cpu consumption , as well as performance impact. This can make its motivation more strong.
-## 3. Weakness (Limitations of the paper)
-1. In this paper, it does not investigate how to configure those parameters for different types of workload.
-2. For its part of partial hashing, I cannot understand the rationale behind it easily. And its adaptive partial hashing function is based on SuperFastHash, it can also consider other hash functions, and whether those types hash function can be adaptive.
-## 4. Future Works
-This paper targets memory deduplcation, which emphazies on responsiveness. How about back-up workload deduplication?
-
-Can we try to classifiy different region, and for different region, we adopt different strategies to improve the performance. (Motivation?)
-
+---
+typora-copy-images-to: paper_figure
+---
+UKSM: Swift Memory Deduplication via Hierarchical and Adaptive Memory Region Distilling
+------------------------------------------
+| Venue | Category |
+| :---: | :------: |
+| FAST'18 | Memory Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+The responsiveness of the memory deduplication process is very important to newly generated pages. If the deduplication approach cannot catch up with the generation speed of memory redundancy, memory pages would be swapped out to the disk, and the whole system is slowed down.
+> State-of-the-art Content Based Page Sharing (CBPS) approaches **lack responsiveness** as they scan every page equally to find redundancies.
+
+- Two key observations
+1. Most pages within the same region present similar duplication patterns.
+2. Partial page hashing needs to be adaptive.
+
+### UKSM
+- Main idea:
+> 1. do not treat every page equally; prioritize different memory regions (with different CPU cycles and different sample intervals) to accelerate the deduplication process.
+> 2. To identify whether two pages are the same, partial page hashing needs to be adaptive to the dynamic workload (to achieve more cost-effective scanning).
+
+#### Technology 1: Hierarchical Region Distilling
+
+The higher the level is, the higher the page scanning frequency is.
+
+- Key characteristics for duplication qualities for each memory region
+> duplication ratio, average page lifetime, copy-on-write ratio
+> those parameters can be collected from each round of sample
+
+- UKSM can set threshold values for each of those parameters to decide whether this memory region can be promoted or demoted to other levels.
+> those thresholds are configurable, and can be set by the users depending on the targeted workload.
+
+- Hierarchical sampling procedure
+**Scan a level**: For each region in a given level, it scans and samples pages according to the level's interval length.
+> a higher level possesses a smaller interval
+
+Once a page is picked, it calculates the page's hash value according to the current hash strength (the number of bytes hashed in each page).
+> it compares the hash value against two red-black trees and updates the counters of that region (one tree manages merged page hashes, the other manages unmerged page hashes)
+
+After each **global sampling round**, the scanner estimates each region's duplication and COW-broken ratios.
+
+#### Technology 2: Adaptive Partial Hashing
+Goal: reduce per-page scan and deduplication cost (by partially hashing a page).
+> **Rationale**: if the hash value is already sufficient to distinguish different pages, it does not need to hash a full page.
+
+For its algorithm, it finds the optimal strength for the hash function (global hashing strength), as sketched below
+> profit: a weaker hash strength saves more time
+> penalty: a weaker hash increases the possibility of false positives, resulting in the additional overhead of **memcmp**.
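+
+A sketch of the mechanism under my own simplification (`strength` is a fixed parameter here rather than adapted from the observed profit/penalty):
+```python
+import hashlib
+
+def partial_hash(page: bytes, strength: int) -> bytes:
+    """Hash only the first `strength` bytes of the page."""
+    return hashlib.blake2b(page[:strength], digest_size=8).digest()
+
+def dedup_pages(pages, strength=256):
+    by_hash = {}                        # partial hash -> first page index seen
+    merged, false_positives = [], 0
+    for i, page in enumerate(pages):
+        h = partial_hash(page, strength)
+        if h not in by_hash:
+            by_hash[h] = i
+            continue
+        j = by_hash[h]
+        if pages[j] == page:            # full comparison (the memcmp penalty)
+            merged.append((i, j))       # page i can be merged into page j
+        else:
+            false_positives += 1        # weak hash collided: wasted comparison
+    return merged, false_positives
+```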
+
+### Implementation and Evaluation
+- Implement in Linux and Xen
+> Around 6,000 lines of C code.
+> Linux: UKSM hooks the linux kernel memory management subsystem for monitoring the creation and termination of application memory regions.
+
+- Evaluation
+1. Memory Saving, CPU Consumption, Deduplication Speed
+Emulated workloads and real-world workloads, compared with the **SuperFastHash** of the native Linux KSM scanner.
+> Emulated workloads: emulate three kinds of workload: statically mixed workload, COW-broken workload, short-lived workload.
+> Real-world workloads: compare with XLH from ATC'13, Docker containers, a desktop server.
+
+2. Analysis of Adaptive Partial Hashing
+> Scanning speed and deduplication speed on regions with different degrees of redundancy.
+> the strength of hash function for different benchmark workloads
+
+## 2. Strength (Contributions of the paper)
+1. This paper does a lot of experiments to show that different workloads lead to different trade-offs between memory saving and CPU consumption, as well as different performance impact. This makes its motivation stronger.
+## 3. Weakness (Limitations of the paper)
+1. In this paper, it does not investigate how to configure those parameters for different types of workload.
+2. The rationale behind the partial hashing part is hard to follow. Its adaptive partial hashing is based on SuperFastHash; it could also consider other hash functions and whether those hash functions can be made adaptive.
+## 4. Future Works
+This paper targets memory deduplication, which emphasizes responsiveness. How about backup workload deduplication?
+
+Can we try to classify different regions and, for each region, adopt different strategies to improve performance? (Motivation?)
+
diff --git a/StoragePaperNote/Deduplication/Metadata-Management/FileRecipeCompression-FAST'13.md b/StoragePaperNote/Deduplication/Metadata-Management/FileRecipeCompression-FAST'13.md
old mode 100644
new mode 100755
index dcd9bcf..76292bf
--- a/StoragePaperNote/Deduplication/Metadata-Management/FileRecipeCompression-FAST'13.md
+++ b/StoragePaperNote/Deduplication/Metadata-Management/FileRecipeCompression-FAST'13.md
@@ -1,104 +1,104 @@
----
-typora-copy-images-to: paper_figure
----
-File Recipe Compression in Data Deduplication Systems
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'13 | Metadata Compression |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The corresponding file recipe data can occupy a significant fraction of the total disk space.
-> if the deduplication ratio is very high
-
-This paper proposes a combination of efficient and scalable compression schemes to shrink the file recipes's size.
-> Chunk index: disk bottleneck
-> file recipes: contains a list of chunk identifiers.
-
-
-In a standard file system, the block pointer is stored in at most **64 bits**.
-> file recipes that store cryptographic fingerprints have a size of at least **20 bytes**
-
-### File Recipe Compression
-- File recipies may occupy a significant portion of the overall storage in a backup deduplication systems.
-> the file recipes grow linearly with the size of the stored logical data.
-
-- One way to reduce the file recipes's size is to use larger chunk sizes.
-> this decreases the deduplication ratio.
-
-- Key Idea:
-Assign a small code words to fingerprints.
-> the code word is then stored instead of the fingerprint in the file recipe.
-
-And this paper wants to investigate different approaches to select the code words.
-
-- Method 1: Zero-Chunk Suppression
-Observation: a few chunks are responsible for a high number of duplicates
-> Zero chunk: the chunk completely filled with zeros.
-> zero chunks are common in VM disk images.
-
-It is easy to detect zero chunks and replace them with a **special code word** by pre-calculating the fingerprint of the chunk filled with zeros.
-
-- Method 2: Chunk Index Page-oriented Approach
-Aims to assign a code word to all chunks
-> is not significantly longer than necessary to have a unique code word for each chunk in the system
-
-This approach uses chunk index's pages to assign code words.
-> code word consists of two parts: prefix and suffix. (page id | unique identifier in page)
-> The combination of suffix and prefix together uniquely identify the fingerprint.
-> prefix: used to identify the index on-disk page where the fingerprint is stored on.
-> suffix: in the on-disk page, it can search the fingerprint entry with the matching suffix.
-
-
-
-
-- Method 3: Statistical Dictionary Compression
-This method generalizes the zero chunk suppression.
-> 1. assign shorter code words to fingerprints based on the probabilies of the chunk usages.
-> 2. the usage of fingerprints is **highly skewed**, certain fingerprints are more likely than others.
-> 3. A **small set** of common fingerprints gets a short code word.
-
-Assign code words to chunks below a entropy threshold:
-> E.g., around 0.3% of the chunks
-> Need to maintain (in-memory) reverse index`
-
-The statistical model relies on a fingerprint's usage without looking at its context. (e.g., previous fingerprints in the data stream)
-> use fingerprint entropy to measure the length of code word.
-
-**Issue**
-the entropy of each chunk is not known in advance and can only be estimated after some data has been written.
-> only assign code words after one or more backups. (The first fingerprints have to be stored unmodified)
-> This paper assumes it is effectively possible to estimate the overall probability of a chunk. (garbage collection)
-
-
-- Method 4: Statistical Prediction Compression
-For each chunk
-1. Determine most-likely following fingerprint
-> Based on order-1 statistic
-> Data structure for order-1 too large at scale
-
-2. Select as prediction fingerprints
-3. Replace correct prediction by 1 byte code
-
-Using estimation of most-frequent following fingerprints
-> This paper mentions it uses the Misra-Gries Algorithms to approximate the detection of frequent fingerprint pairs.
-
-### Implementation and Evaluation
-Evaluation: Trace-based simulation of weekly full backup, three datasets
-> Compression Ratio: Pages-Based and Statistical Prediction
-> Combined compression ratio: four methods
-
-## 2. Strength (Contributions of the paper)
-1. This paper proposes some new compression approaches for file recipes. And the combination approaches can shrinks file recipes by more than $90\%$
-## 3. Weakness (Limitations of the paper)
-1. The writing of this paper is not very clear, I cannot fully understand the last two methods easily.
-2. In the statistical dictionary compression, it mentions its method is similar to Huffman coding. However, in order to implement this method to billions of chunks, it needs to relax to non-optimal.
-3. For non-backup systems, file recipe overheah and compression may not be important.
-## 4. Future Works
-1. In this paper, it mentions how to estimate the overall probability of a chunk.
-> It says the number of references to a chunk (usage count) is often collected for **garbage collection purpose**. This can make usage count be calculated easily.
-
-2. The redundant data found by the deduplication process is usually clustered together and forms larger sequences of redundant and unique chunks.
+---
+typora-copy-images-to: paper_figure
+---
+File Recipe Compression in Data Deduplication Systems
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'13 | Metadata Compression |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The corresponding file recipe data can occupy a significant fraction of the total disk space.
+> if the deduplication ratio is very high
+
+This paper proposes a combination of efficient and scalable compression schemes to shrink the size of file recipes.
+> Chunk index: disk bottleneck
+> file recipes: contain a list of chunk identifiers
+
+
+In a standard file system, the block pointer is stored in at most **64 bits**.
+> file recipes that store cryptographic fingerprints have a size of at least **20 bytes**
+
+### File Recipe Compression
+- File recipes may occupy a significant portion of the overall storage in backup deduplication systems.
+> the file recipes grow linearly with the size of the stored logical data.
+
+- One way to reduce the file recipes' size is to use larger chunk sizes.
+> this decreases the deduplication ratio.
+
+- Key Idea:
+Assign small code words to fingerprints.
+> the code word is then stored instead of the fingerprint in the file recipe.
+
+This paper investigates different approaches to selecting the code words.
+
+- Method 1: Zero-Chunk Suppression
+Observation: a few chunks are responsible for a high number of duplicates
+> Zero chunk: the chunk completely filled with zeros.
+> zero chunks are common in VM disk images.
+
+It is easy to detect zero chunks and replace them with a **special code word** by pre-calculating the fingerprint of the all-zero chunk (a minimal sketch follows below).
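+
+A minimal sketch of zero-chunk suppression (assuming fixed-size chunks; the chunk size, escape byte, and code word are illustrative):
+```python
+import hashlib
+
+CHUNK_SIZE = 8 * 1024
+ZERO_FP = hashlib.sha1(bytes(CHUNK_SIZE)).digest()   # precomputed once
+ZERO_CODE = b"\x00"                                  # special 1-byte code word
+
+def encode_recipe_entry(chunk: bytes) -> bytes:
+    """Zero chunks collapse to 1 byte; other chunks keep escape byte + fingerprint."""
+    fp = hashlib.sha1(chunk).digest()
+    return ZERO_CODE if fp == ZERO_FP else b"\x01" + fp
+
+def decode_recipe_entry(entry: bytes) -> bytes:
+    """Return the fingerprint to look up (zero chunks need no index lookup)."""
+    return ZERO_FP if entry == ZERO_CODE else entry[1:]
+
+assert encode_recipe_entry(bytes(CHUNK_SIZE)) == ZERO_CODE
+assert decode_recipe_entry(encode_recipe_entry(b"abc" * 100)) == hashlib.sha1(b"abc" * 100).digest()
+```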
+
+- Method 2: Chunk Index Page-oriented Approach
+Aims to assign a code word to every chunk
+> the code word is not significantly longer than necessary to uniquely identify each chunk in the system
+
+This approach uses the chunk index's pages to assign code words (see the sketch below).
+> a code word consists of two parts: prefix and suffix. (page id | unique identifier within the page)
+> The combination of prefix and suffix uniquely identifies the fingerprint.
+> prefix: used to identify the on-disk index page where the fingerprint is stored.
+> suffix: within that on-disk page, it searches for the fingerprint entry with the matching suffix.
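+
+A small sketch of such a code word (here the suffix is modeled as a slot number within the index page and the bit width is made up; the paper derives the suffix from the fingerprint itself):
+```python
+SUFFIX_BITS = 10                        # up to 1024 entries per index page (made up)
+
+def encode(page_id: int, slot_in_page: int) -> int:
+    """Code word = prefix (index page id) | suffix (identifier within the page)."""
+    return (page_id << SUFFIX_BITS) | slot_in_page
+
+def decode(code_word: int):
+    return code_word >> SUFFIX_BITS, code_word & ((1 << SUFFIX_BITS) - 1)
+
+# Restore path: the prefix locates the on-disk index page to read; the suffix
+# selects the matching fingerprint entry inside that page.
+assert decode(encode(123_456, 7)) == (123_456, 7)
+```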
+
+
+
+
+- Method 3: Statistical Dictionary Compression
+This method generalizes the zero chunk suppression.
+> 1. assign shorter code words to fingerprints based on the probabilities of chunk usage.
+> 2. the usage of fingerprints is **highly skewed**, certain fingerprints are more likely than others.
+> 3. A **small set** of common fingerprints gets a short code word.
+
+Assign code words to chunks below an entropy threshold:
+> E.g., around 0.3% of the chunks
+> Need to maintain an (in-memory) reverse index
+
+The statistical model relies on a fingerprint's usage without looking at its context. (e.g., previous fingerprints in the data stream)
+> use fingerprint entropy to measure the length of code word.
+
+**Issue**
+the entropy of each chunk is not known in advance and can only be estimated after some data has been written.
+> only assign code words after one or more backups. (The first fingerprints have to be stored unmodified)
+> This paper assumes it is effectively possible to estimate the overall probability of a chunk. (garbage collection)
+
+
+- Method 4: Statistical Prediction Compression
+For each chunk
+1. Determine most-likely following fingerprint
+> Based on order-1 statistic
+> Data structure for order-1 too large at scale
+
+2. Select it as the predicted fingerprint
+3. Replace a correct prediction with a 1-byte code
+
+Uses an estimation of the most frequent following fingerprints
+> This paper mentions it uses the Misra-Gries algorithm to approximate the detection of frequent fingerprint pairs.
+
+### Implementation and Evaluation
+Evaluation: Trace-based simulation of weekly full backup, three datasets
+> Compression Ratio: Pages-Based and Statistical Prediction
+> Combined compression ratio: four methods
+
+## 2. Strength (Contributions of the paper)
+1. This paper proposes some new compression approaches for file recipes, and the combined approach can shrink file recipes by more than $90\%$
+## 3. Weakness (Limitations of the paper)
+1. The writing of this paper is not very clear; I cannot fully understand the last two methods easily.
+2. In the statistical dictionary compression, it mentions that its method is similar to Huffman coding. However, to apply this method to billions of chunks, it has to relax to a non-optimal coding.
+3. For non-backup systems, file recipe overhead and compression may not be important.
+## 4. Future Works
+1. In this paper, it mentions how to estimate the overall probability of a chunk.
+> It says the number of references to a chunk (usage count) is often collected for **garbage collection purposes**. This makes the usage count easy to obtain.
+
+2. The redundant data found by the deduplication process is usually clustered together and forms larger sequences of redundant and unique chunks.
3. Some compression approaches in this paper can also be deployed to save the space of metadata in deduplication systems.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Metadata-Management/MetadataHarmful-HotStorage'15.md b/StoragePaperNote/Deduplication/Metadata-Management/MetadataHarmful-HotStorage'15.md
old mode 100644
new mode 100755
index 5e7db76..4d4f97b
--- a/StoragePaperNote/Deduplication/Metadata-Management/MetadataHarmful-HotStorage'15.md
+++ b/StoragePaperNote/Deduplication/Metadata-Management/MetadataHarmful-HotStorage'15.md
@@ -1,82 +1,82 @@
----
-typora-copy-images-to: ../paper_figure
----
-Metadata Considered Harmful ... to Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| HotStorage'15 | Deduplication Metadata |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The effectiveness of deduplication can vary dramatically depending on the data stored.
-
-This paper shows that many file formats suffer from a fundamental design property that is **incompatible** with deduplication.
-> intersperse metadata with data in ways that result in otherwise identical data being different.
-
-Little work has been done to examine the impact of input data in deduplication.
-
-### Metadata meets deduplication
-Main idea: separate metadata from data
->1. design deduplication-friendly formats
->2. application-level post-processing
->3. format-aware deduplication
-
-- Metadata impacts deduplication
->1. Metadata changes: the input is an aggregate of many small user files, which is interleaved with file metadata. (including, file path, timestamps, etc.)
->
->
->By mixing more frequently changing metadata with data blocks, the *tar* format unnecessarily introduces many more unique chunks.
->
->
->
->2. Metadata location changes: the input is encoded in blocks and metadata is inserted for each block, data insertion/deletion lead to metadata shifts.
-
-
-- Deduplication-friendly formats
-1. Common data format
-separate data from metadata (EMC backup format)
-> the metadata of all files is grouped together and stored in one section.
-
-2. Application-level post-processing (Migratory tar)
-*tar* is a well-defined data format
-> unfriendly to deduplication
-> in wide use for decades, thus hard to change for compatibility reasons.
-> tar file is a sequence of entries
-> a entry: one header block + many data blocks
-
-**Migratory tar** (mtar)
-separate metadata from data blocks by colocating metadata blocks at the end of the mtar file.
-
-
-Store the offset of the metadata block in the first block of a mtar file for efficient access
-> a restore operation reads the first block, find the first header block
-> reads all data blocks for that file
-> repeat this process for every file
-
-
-
-### Implementation and Evaluation
-- Implementation:
-1. Modify the tar program to support convert $tar \Rightarrow mtar$
-2. use fs-hasher to support chunking for comparison
-
-
-- Evaluation
-dataset: the linux kernel distribution
-1. compare the deduplication ratio with different methods
-
-## 2. Strength (Contributions of the paper)
-1. The main contribution of this paper is it identifies and categorizes barriers to deduplication, and provides two cases study:
-> Industrial experience: EMC Data Domain
-> Academic research: GNU tar
-
-## 3. Weakness (Limitations of the paper)
-1. The idea of this paper is very simple, I only concern the restore performance of mtar
-## 4. Future Works
-1. This key insight in this paper is separating metadata from data can improve the deduplication ratio significantly.
-2. This paper also mentions it is very necessary to design a data format to be deduplication-friendly.
-> the application can improve deduplication in a **platform-independent** manner while isolating the storage system from the data format.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Metadata Considered Harmful ... to Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| HotStorage'15 | Deduplication Metadata |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The effectiveness of deduplication can vary dramatically depending on the data stored.
+
+This paper shows that many file formats suffer from a fundamental design property that is **incompatible** with deduplication.
+> intersperse metadata with data in ways that result in otherwise identical data being different.
+
+Little work has been done to examine the impact of input data on deduplication.
+
+### Metadata meets deduplication
+Main idea: separate metadata from data
+>1. design deduplication-friendly formats
+>2. application-level post-processing
+>3. format-aware deduplication
+
+- Metadata impacts deduplication
+>1. Metadata changes: the input is an aggregate of many small user files, which is interleaved with file metadata (including file paths, timestamps, etc.)
+>
+>
+>By mixing more frequently changing metadata with data blocks, the *tar* format unnecessarily introduces many more unique chunks.
+>
+>
+>
+>2. Metadata location changes: the input is encoded in blocks and metadata is inserted for each block; data insertions/deletions lead to metadata shifts.
+
+
+- Deduplication-friendly formats
+1. Common data format
+separate data from metadata (EMC backup format)
+> the metadata of all files is grouped together and stored in one section.
+
+2. Application-level post-processing (Migratory tar)
+*tar* is a well-defined data format
+> unfriendly to deduplication
+> in wide use for decades, thus hard to change for compatibility reasons.
+> a tar file is a sequence of entries
+> an entry: one header block + many data blocks
+
+**Migratory tar** (mtar)
+separates metadata from data blocks by colocating the metadata blocks at the end of the mtar file.
+
+
+Store the offset of the metadata block in the first block of an mtar file for efficient access (a toy model is sketched below)
+> a restore operation reads the first block and finds the first header block
+> then reads all data blocks for that file
+> and repeats this process for every file
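+
+A toy model of this layout (my own illustration, not GNU tar's actual block format): data blocks come first, the grouped metadata goes at the end, and the first block holds the metadata offset:
+```python
+import io, json, struct
+
+BLOCK = 512
+
+def pack_mtar(files):                  # files: {name: bytes}
+    data_region, headers = io.BytesIO(), []
+    for name, payload in files.items():
+        headers.append({"name": name, "offset": data_region.tell(), "size": len(payload)})
+        data_region.write(payload)
+    meta = json.dumps(headers).encode()
+    meta_offset = BLOCK + data_region.tell()
+    first_block = struct.pack("<Q", meta_offset).ljust(BLOCK, b"\0")
+    return first_block + data_region.getvalue() + meta
+
+def restore(mtar: bytes):
+    meta_offset = struct.unpack("<Q", mtar[:8])[0]   # read the first block
+    headers = json.loads(mtar[meta_offset:])         # then the grouped metadata
+    return {h["name"]: mtar[BLOCK + h["offset"]: BLOCK + h["offset"] + h["size"]]
+            for h in headers}
+
+assert restore(pack_mtar({"a.txt": b"hello", "b.txt": b"world"})) == {"a.txt": b"hello", "b.txt": b"world"}
+```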
+
+
+
+### Implementation and Evaluation
+- Implementation:
+1. Modify the tar program to support converting $tar \Rightarrow mtar$
+2. use fs-hasher to support chunking for comparison
+
+
+- Evaluation
+dataset: the linux kernel distribution
+1. compare the deduplication ratio with different methods
+
+## 2. Strength (Contributions of the paper)
+1. The main contribution of this paper is that it identifies and categorizes barriers to deduplication, and provides two case studies:
+> Industrial experience: EMC Data Domain
+> Academic research: GNU tar
+
+## 3. Weakness (Limitations of the paper)
+1. The idea of this paper is very simple; my only concern is the restore performance of mtar
+## 4. Future Works
+1. The key insight of this paper is that separating metadata from data can improve the deduplication ratio significantly.
+2. This paper also argues that it is necessary to design data formats to be deduplication-friendly.
+> the application can improve deduplication in a **platform-independent** manner while isolating the storage system from the data format.
+
3. The tar format is unfriendly for deduplication, specifically, they found that metadata blocks have no deduplication, while data blocks show high deduplication ratios.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Metadata-Management/Metadedup.md b/StoragePaperNote/Deduplication/Metadata-Management/Metadedup.md
old mode 100644
new mode 100755
index b465aa1..805b668
--- a/StoragePaperNote/Deduplication/Metadata-Management/Metadedup.md
+++ b/StoragePaperNote/Deduplication/Metadata-Management/Metadedup.md
@@ -1,62 +1,62 @@
----
-typora-copy-images-to: paper_figure
----
-Metadedup: Deduplicating Metadata in Encrypted Deduplication via Indirection
-------------------------------------------
-@MSST'19 (under review) @deduplication (metadata)
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-This paper wants to reduce the storage overhead of **deduplication metadata** and **key metadata** of encrypted deduplication.
-> 1. deduplication metadata: fingerprint index of all chunks and file recipe (hold the mapping from chunks in the file to the references of the physical copies.)
-> 2. key metadata: chunk-to-key mappings to allow the decryption of individual files. (encrypted by a master key of file owners)
-
-### Metadedup
-- Metadedup applies deduplication to remove content redundancies of both data and metadata, so as to improve the overall storage efficiency. Three key goals
->1. reduce the storage overhead of metadata
->2. provide the security for both data and metadata
->3. introduce low overhead of the overall performance
-
-- Design
->1. Indirection: Metadedup collects the metadata of multiple regions of **adjacent encrypted data chunks** into *metadata chunk*.
->2. Segmentation:
-> 
-
-- Two kinds of operations
-
-**Write Operation**:
-> 1. divide the target file into a set of data chunks, and do encryption of them
-> 2. divde the set of encrypted chunks into a set of segments
-> 3. For each segment, add the metadata of encrypted chunks in this segment to an individual metadata chunk. **(each segment has a metadata chunk)**
-> 4. Add deduplication metadata of encrypted metadata chunk to recipes.
-> 5. After server received encrypted data chunks and encrypted metadata chunks, it executes the deduplication according to the fingerprint index of data chunks and metadata chunks.
-
-**Restore Operation**:
-Two-round interactions:
-> 1. Client sends the request to get metadata of the file. Server sends back the encrypted metadata chunks and encrypted key recipes
-> 2. Client decrypts the key recipe and metadata chunks, and requests the file data. Server sends back the encrypted data chunks to the client. And client decrypts each data chunk based on the corresponding key in metadata chunks.
-
-
-### Implementation and Evaluation
-This paper implements the Metadedup based on CDStore
-1. It adds a metadata chunk construction module in CDStore, which treats $s$ input share streams as encrypted data chunks.
-
-2. It implements the fingerprint index by using the key-value store LevelDB, which maps a fingerprint to an ID of a container which stores the corresponding data share or metadata chunk.
-
-
-Evaluation:
-1. Microbenchmarks test: consumed time and throughput in both write and restore operations
-2. prototype test: the loss performance with various segment sizes, data sizes
-3. trace-driven simulation: storage efficiency. (storage space it can save)
-
-## 2. Strength (Contributions of the paper)
-- This paper proves the high metadata storage overhead in encrypted deduplication system by using mathematical analysis and trace-driven simulation. This can support the motivation of this paper well.
-
-- It also considers the compatibility of Metadedup with some existing countermeasures, which can make its threat model more consolidated.
-## 3. Weakness (Limitations of the paper)
-- Compared with the case without metadata deduplication, it needs to retrieve from server twice in its restore operation.
-- From it experiments, it shows that handling metadata chunk consumes dominated time, it can become as the bottleneck when there are too many metadata chunks.
-## 4. Future Works
-- It can further optimize restore operation because this two round may take too much time. One thing that can be considered is how to merge those two steps as one step (metadata chunk + data chunk)
+---
+typora-copy-images-to: paper_figure
+---
+Metadedup: Deduplicating Metadata in Encrypted Deduplication via Indirection
+------------------------------------------
+@MSST'19 (under review) @deduplication (metadata)
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+This paper wants to reduce the storage overhead of **deduplication metadata** and **key metadata** of encrypted deduplication.
+> 1. deduplication metadata: the fingerprint index of all chunks and the file recipes (holding the mapping from chunks in the file to references to the physical copies)
+> 2. key metadata: chunk-to-key mappings to allow the decryption of individual files. (encrypted by a master key of file owners)
+
+### Metadedup
+- Metadedup applies deduplication to remove content redundancies of both data and metadata, so as to improve the overall storage efficiency. Three key goals
+>1. reduce the storage overhead of metadata
+>2. provide the security for both data and metadata
+>3. introduce low overhead of the overall performance
+
+- Design
+>1. Indirection: Metadedup collects the metadata of multiple regions of **adjacent encrypted data chunks** into *metadata chunk*.
+>2. Segmentation:
+> 
+
+- Two kinds of operations
+
+**Write Operation**:
+> 1. divide the target file into a set of data chunks, and encrypt them
+> 2. divide the set of encrypted chunks into a set of segments
+> 3. For each segment, add the metadata of the encrypted chunks in this segment to an individual metadata chunk. **(each segment has a metadata chunk)**
+> 4. Add the deduplication metadata of the encrypted metadata chunks to recipes.
+> 5. After the server receives the encrypted data chunks and encrypted metadata chunks, it performs deduplication according to the fingerprint index of data chunks and metadata chunks.
+
+**Restore Operation**:
+Two-round interactions (a simplified sketch of the indirection follows below):
+> 1. The client sends a request to get the metadata of the file. The server sends back the encrypted metadata chunks and encrypted key recipes.
+> 2. The client decrypts the key recipe and metadata chunks, and requests the file data. The server sends back the encrypted data chunks, and the client decrypts each data chunk using the corresponding key in the metadata chunks.
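+
+A simplified sketch of the indirection idea (my own illustration; encryption, key recipes, and the client/server split are omitted, and the segment size is made up), where metadata chunks go through the same fingerprint index as data chunks:
+```python
+import hashlib, json
+
+SEGMENT_SIZE = 4                        # chunks per segment (illustrative)
+store = {}                              # fingerprint -> chunk bytes (deduplicated)
+
+def put(chunk: bytes) -> bytes:
+    fp = hashlib.sha256(chunk).digest()
+    store.setdefault(fp, chunk)         # duplicate chunks are stored only once
+    return fp
+
+def write_file(chunks):
+    recipe = []                         # file recipe: metadata-chunk fingerprints
+    for s in range(0, len(chunks), SEGMENT_SIZE):
+        segment = chunks[s:s + SEGMENT_SIZE]
+        entries = [{"fp": put(c).hex(), "size": len(c)} for c in segment]
+        meta_chunk = json.dumps(entries).encode()
+        recipe.append(put(meta_chunk))  # the metadata chunk is deduplicated too
+    return recipe
+
+def restore_file(recipe):
+    out = []
+    for meta_fp in recipe:              # round 1: fetch metadata chunks
+        for e in json.loads(store[meta_fp]):
+            out.append(store[bytes.fromhex(e["fp"])])   # round 2: fetch data
+    return b"".join(out)
+
+chunks = [b"A" * 100, b"B" * 100, b"A" * 100, b"C" * 100, b"A" * 100]
+assert restore_file(write_file(chunks)) == b"".join(chunks)
+```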
+
+
+### Implementation and Evaluation
+This paper implements Metadedup based on CDStore
+1. It adds a metadata chunk construction module in CDStore, which treats $s$ input share streams as encrypted data chunks.
+
+2. It implements the fingerprint index by using the key-value store LevelDB, which maps a fingerprint to an ID of a container which stores the corresponding data share or metadata chunk.
+
+
+Evaluation:
+1. Microbenchmarks test: consumed time and throughput in both write and restore operations
+2. prototype test: the performance loss with various segment sizes and data sizes
+3. trace-driven simulation: storage efficiency. (storage space it can save)
+
+## 2. Strength (Contributions of the paper)
+- This paper demonstrates the high metadata storage overhead in encrypted deduplication systems using mathematical analysis and trace-driven simulation, which supports its motivation well.
+
+- It also considers the compatibility of Metadedup with some existing countermeasures, which makes its threat model more solid.
+## 3. Weakness (Limitations of the paper)
+- Compared with the case without metadata deduplication, it needs two rounds of retrieval from the server in its restore operation.
+- Its experiments show that handling metadata chunks consumes the dominant share of time; this can become the bottleneck when there are too many metadata chunks.
+## 4. Future Works
+- It can further optimize the restore operation because the two rounds may take too much time. One possibility is to merge those two steps into one (metadata chunk + data chunk)
- It is obvious that handling metadata chunks is the main bottleneck. However, given a target file, the number of metadata chunks depends on the segment size. It can further consider how to mitigate this bottleneck by controlling the segment size.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Metadata-Management/REED-DSN'16.md b/StoragePaperNote/Deduplication/Metadata-Management/REED-DSN'16.md
old mode 100644
new mode 100755
index d038923..6e5a2fd
--- a/StoragePaperNote/Deduplication/Metadata-Management/REED-DSN'16.md
+++ b/StoragePaperNote/Deduplication/Metadata-Management/REED-DSN'16.md
@@ -1,115 +1,115 @@
----
-typora-copy-images-to: ../paper_figure
----
-Rekeying for Encrypted Deduplication Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| DSN'16 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Replacing an existing key with a new key for encryption
-> it can renews security protection, so as to protect against key compromise and enable dynamic access control in cryptogrephic storage.
-
-This paper implements a rekeying-aware encrypted deduplication storage system.
-> to realize efficient rekeying
-> trade between performance and security, achieve dynamic access control
-
-Why realizing efficient rekeying in encrypted deduplication storage is challenging?
-> 1. if it renews the key by renewing the derivation function $\rightarrow$ any newly stored message encrypted by the new key can no longer be deduplicated with the existing identical message.
-> 2. if it re-encrypts all existing messages with the new key , then there will be tremendous performance overheads.
-
-**Key Question**: how to enable secure and lightweight rekeying, while preserving the deduplication capability?
-
-
-### REED
-- Main idea:
-REED encrypts a small part of the package with a key that is subject to rekeying, while the remaining large part of the package is generated from a deterministic variant of AONT.
-> preserve content similarity.
-> sacrificing a slight degradation of storage efficiency.
-
-Augments CAONT to enable rekeying
->1. generate a CAONT package with the **MLE key** as an input
->2. encrypt a small part of the package with the **file key** $\rightarrow$ stab
->3. Since the stab is very samll, the rekeying overhead can be mitigated.
-
-- System model
-In this work, it still keeps a dedicated key manager server
-> for key generation
-> resisting brute-force attack
-
-
-
-- Threat model
-1. An honest-but-curious adversary
-Aims to learn the content of the files in outsourced storage.
-> can compromise the cloud (any hosted server and the storage backend): all stored chunks and keys
-> can collude with a subset of unauthorized or revoked clients
-> can monitor the activities of the clients, identify the result returned by the key manager.
-
-2. the key manager is deployed in a fully protected zone
-> an adversary cannot compromise or gain access to the key manager.
-
-3. Server-side deduplication
-it does not introduce any side channel in deduplication.
-
-
-- Two rekeying-aware encryption schemes
-1. Basic encryption scheme
-> 1. Modify the cryptographic hash key in CANOT by the corresponding MLE key $K_M$ generated by the key manager.
-> 2. append a publicly known, fixed-size canary $c$ to $M$ for CANOT, so that the integrity of $M$ can be checked.
-
-2. Enhanced encryption scheme
-Goal: against the adversary to compromise the MLE key, the adversary can recover the pseudo-random mask.
-
-
-
-Need an additional encryption step.
-
-- Dynamic access control
-Associating each file with **policy**
-> ciphertext policy attribute-based encryption (CP-ABE): can be used to control the access privileges.
-> key regression
-> 
-
-### Implementation and Evaluation
-- Implementation
-C++: extend CDStore to support the rekeying
-Some points: batching, caching, parallelization
-> mitigate computational and I/O overhead
-
-- Evaluation setting
-One REED client, one key manager, and five REED servers
-> four of the five servers manages the key store
-
-Datasets:
-> **Synthetic data**: 2GB file of the synthetic data, load the synthetic data into memory to avoid generating any disk I/O overhead
-> **Real-world data**: FSL trace: 2013
-
-1. MLE key generation performance
-> fix the batch size as 256 per-chunk key generation requests. Around $17.64$MB/s
-> MLE key generation performance: start from sending the blinded fingerprints to the key manager
-
-
-2. Encryption performance
-> observe that the encryption speed is not the performance bottleneck in REED (network speed $1$Gb/s)
-
-3. Upload and download performance
-including the chunking, key generation, encryption, and data transfer.
-> the main bottleneck is the MLE key generation speed.
-> eight clients: $374.9$MB/s
-
-## 2. Strength (Contributions of the paper)
-1. This paper proposes two encryption schemes for REED, which trades between performance and security.
-> the enhanced scheme is resilient against key leakage through a more expensive encryption.
-
-
-## 3. Weakness (Limitations of the paper)
-
-
-## 4. Future Works
-1. In this paper, it mentions the design of a single key manager can be generalized for multiple key managers for improved availability.
-2. The idea of this paper is to offload the rekeying operation to the stub, by this way it can reduce the overhead of update, and preserve the deduplication for the trimmed package.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Rekeying for Encrypted Deduplication Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| DSN'16 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Replacing an existing key with a new key for encryption
+> it renews security protection, so as to protect against key compromise and enable dynamic access control in cryptographic storage.
+
+This paper implements a rekeying-aware encrypted deduplication storage system.
+> to realize efficient rekeying
+> trade between performance and security, achieve dynamic access control
+
+Why is realizing efficient rekeying in encrypted deduplication storage challenging?
+> 1. if it renews the key by renewing the derivation function $\rightarrow$ any newly stored message encrypted by the new key can no longer be deduplicated with the existing identical message.
+> 2. if it re-encrypts all existing messages with the new key, then there will be tremendous performance overheads.
+
+**Key Question**: how to enable secure and lightweight rekeying, while preserving the deduplication capability?
+
+
+### REED
+- Main idea:
+REED encrypts a small part of the package with a key that is subject to rekeying, while the remaining large part of the package is generated from a deterministic variant of AONT.
+> preserve content similarity.
+> at the cost of a slight degradation in storage efficiency.
+
+Augments CAONT to enable rekeying
+>1. generate a CAONT package with the **MLE key** as an input
+>2. encrypt a small part of the package with the **file key** $\rightarrow$ stub
+>3. Since the stub is very small, the rekeying overhead can be mitigated.
+
+- System model
+In this work, it still keeps a dedicated key manager server
+> for key generation
+> resisting brute-force attack
+
+
+
+- Threat model
+1. An honest-but-curious adversary
+Aims to learn the content of the files in outsourced storage.
+> can compromise the cloud (any hosted server and the storage backend): all stored chunks and keys
+> can collude with a subset of unauthorized or revoked clients
+> can monitor the activities of the clients, identify the result returned by the key manager.
+
+2. the key manager is deployed in a fully protected zone
+> an adversary cannot compromise or gain access to the key manager.
+
+3. Server-side deduplication
+it does not introduce any side channel in deduplication.
+
+
+- Two rekeying-aware encryption schemes
+1. Basic encryption scheme
+> 1. Replace the cryptographic hash key in CAONT with the corresponding MLE key $K_M$ generated by the key manager.
+> 2. Append a publicly known, fixed-size canary $c$ to $M$ for CAONT, so that the integrity of $M$ can be checked.
+
+2. Enhanced encryption scheme
+Goal: defend against an adversary that compromises the MLE key; in the basic scheme, such an adversary can recover the pseudo-random mask.
+
+This requires an additional encryption step (see the sketch below).
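+
+Below is a minimal, illustrative sketch of the basic scheme plus the stub split as I read them from these notes (not the authors' code); the 32-byte stub size, the AES-CTR-based mask, SHA-256, and the `cryptography` library are my own assumptions.
+
+```python
+# Hypothetical sketch: a deterministic CAONT-like transform keyed by the MLE
+# key, then only a small stub is encrypted with the rekeyable file key.
+import hashlib
+from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+
+STUB_SIZE = 32           # assumed stub size, small relative to the package
+CANARY = b"\x00" * 16    # publicly known, fixed-size canary
+
+def _mask(key: bytes, length: int) -> bytes:
+    """Pseudo-random mask G(key): AES-CTR keystream over zero bytes."""
+    enc = Cipher(algorithms.AES(key), modes.CTR(b"\x00" * 16)).encryptor()
+    return enc.update(b"\x00" * length)
+
+def caont_package(chunk: bytes, mle_key: bytes) -> bytes:
+    """Basic scheme: head = (M || canary) XOR G(K_M), tail = K_M XOR H(head).
+    Deterministic in (chunk, MLE key), so identical chunks still deduplicate.
+    mle_key is assumed to be 32 bytes."""
+    msg = chunk + CANARY
+    head = bytes(a ^ b for a, b in zip(msg, _mask(mle_key, len(msg))))
+    tail = bytes(a ^ b for a, b in zip(mle_key, hashlib.sha256(head).digest()))
+    return head + tail
+
+def split_and_protect(package: bytes, file_key: bytes):
+    """Encrypt only the stub with the file key, so rekeying re-encrypts the
+    stub alone; the enhanced scheme would add one more keyed encryption step
+    before this split (omitted here)."""
+    stub, trimmed = package[:STUB_SIZE], package[STUB_SIZE:]
+    enc = Cipher(algorithms.AES(file_key), modes.CTR(b"\x01" * 16)).encryptor()
+    return enc.update(stub), trimmed
+```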
+
+- Dynamic access control
+Associate each file with a **policy**
+> ciphertext policy attribute-based encryption (CP-ABE): can be used to control the access privileges.
+> key regression
+
+### Implementation and Evaluation
+- Implementation
+C++: extend CDStore to support the rekeying
+Some points: batching, caching, parallelization
+> mitigate computational and I/O overhead
+
+- Evaluation setting
+One REED client, one key manager, and five REED servers
+> four of the five servers manage the key store
+
+Datasets:
+> **Synthetic data**: a 2GB synthetic file, loaded into memory to avoid any disk I/O overhead
+> **Real-world data**: the FSL trace (2013)
+
+1. MLE key generation performance
+> fix the batch size at 256 per-chunk key generation requests; around $17.64$MB/s
+> MLE key generation time is measured from sending the blinded fingerprints to the key manager
+
+
+2. Encryption performance
+> observe that the encryption speed is not the performance bottleneck in REED (network speed $1$Gb/s)
+
+3. Upload and download performance
+including the chunking, key generation, encryption, and data transfer.
+> the main bottleneck is the MLE key generation speed.
+> eight clients: $374.9$MB/s
+
+## 2. Strength (Contributions of the paper)
+1. This paper proposes two encryption schemes for REED, which trade between performance and security.
+> the enhanced scheme is resilient against key leakage at the cost of a more expensive encryption.
+
+
+## 3. Weakness (Limitations of the paper)
+
+
+## 4. Future Works
+1. This paper mentions that the single-key-manager design can be generalized to multiple key managers for improved availability.
+2. The idea of this paper is to confine the rekeying operation to the stub; in this way it reduces the update overhead and preserves deduplication for the trimmed package.
+
diff --git a/StoragePaperNote/Deduplication/Post-Dedup/Finesse-FAST'19.md b/StoragePaperNote/Deduplication/Post-Dedup/Finesse-FAST'19.md
old mode 100644
new mode 100755
index 416cd9c..b1ebaba
--- a/StoragePaperNote/Deduplication/Post-Dedup/Finesse-FAST'19.md
+++ b/StoragePaperNote/Deduplication/Post-Dedup/Finesse-FAST'19.md
@@ -1,63 +1,63 @@
----
-typora-copy-images-to: paper_figure
----
-Finesse: Fine-Grained Feature Locality based Fast Resemblance Detection for Post-Deduplication Delta Compression
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| FAST'19 | Similarity Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The drawback in current deduplication:
-> it cannot remove redundant data among non-duplicate but very **similar** chunks
-
-To handle this issue, a way is to adopt delta compression whose main idea is exeucting **resemblance detection** to detect delta compression candidate, and just storing the difference (delta) and base chunk.
-
-**Issue**: Current similarity detection algorithm is based on *N-transform Super-Feature* which needs to extract a fixed number of features from a chunk by requiring N linear transformations for each fingerprint to generate **N-dimensional hash value sets**.
-> this process is time-consumed and compute-intensive because it requires too many linear transformations when generating **super features**
-
-### Finesse
-Based on a key observation of fine-grained locality among similar chunks (**feature locality**)
-> the corresponding subchunk of chunks and their features also appear in same order among the similar chunks with a very high probability.
-
-- Key Question
-How to fast detect high similarity chunk?
-
-- Basic Idea
-computing the similarity by first
-> 1. dividing each chunk into several subchunks (to get N features $\rightarrow$ N equal-sized subchunks)
-> 2. quickly computing features from each subchunk
-> 3. finally grouping these features into **SFs**.
-
-By this way, it can eliminate the linear transformations and thus simplify the feature computation.
-> Point of support: Most of the corresponding subchunk pairs in the detected similar chunks have the same features in six datasets.
-
-
-
-- Feature extraction
-divide the chunk into several sub-chunks with fixed size, get the maximum rabin fingerprint value.
-
-- Feature grouping
-Principle: features in an SF should be extracted from the subchunks distributed uniformly across the chunk.
-> Can be implemented simply by sorting
-
-### Implementation and Evaluation
-Implementation:
-implement delta compression in an open-source deduplication prototype system (Destor)
-deltra encoding $\rightarrow$ Xdelta
-
-Evaluation:
-1. Delta Compression Ratio (DCR): overall space saving
-2. Delta Compression Efficiency (DCE): focus on teh detected resembling chunks
-3. Similarity Computing Speed
-4. System Throughput
-
-## 2. Strength (Contributions of the paper)
-1. This paper tries to reduce the computation overhead of resemblance detection by using the feature loaclity in subchunks. It provides sufficient experiment observation to support its idea which makes it more concrete.
-## 3. Weakness (Limitations of the paper)
-1. One key limitation in this paper is it can just detect the similar chunks with same size. How to handle the case when two high similarity chunks with different sizes can be considerd.
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+Finesse: Fine-Grained Feature Locality based Fast Resemblance Detection for Post-Deduplication Delta Compression
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| FAST'19 | Similarity Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The drawback in current deduplication:
+> it cannot remove redundant data among non-duplicate but very **similar** chunks
+
+To handle this issue, one way is to adopt delta compression, whose main idea is executing **resemblance detection** to find delta compression candidates, and then storing only the difference (delta) against a base chunk.
+
+**Issue**: Current similarity detection algorithms are based on the *N-transform Super-Feature*, which extracts a fixed number of features from a chunk by applying N linear transformations to each fingerprint to generate **N-dimensional hash value sets**.
+> this process is time-consuming and compute-intensive because it requires too many linear transformations when generating **super-features**
+
+### Finesse
+Based on a key observation of fine-grained locality among similar chunks (**feature locality**)
+> the corresponding subchunks and their features also appear in the same order among similar chunks with a very high probability.
+
+- Key Question
+How to quickly detect highly similar chunks?
+
+- Basic Idea
+Compute the similarity by:
+> 1. dividing each chunk into several subchunks (to get N features $\rightarrow$ N equal-sized subchunks)
+> 2. quickly computing features from each subchunk
+> 3. finally grouping these features into **SFs**.
+
+In this way, it eliminates the linear transformations and thus simplifies the feature computation.
+> Point of support: Most of the corresponding subchunk pairs in the detected similar chunks have the same features in six datasets.
+
+
+
+- Feature extraction
+Divide the chunk into several equal-sized subchunks and take the maximum Rabin fingerprint value within each subchunk as its feature (see the sketch after the feature-grouping item below).
+
+- Feature grouping
+Principle: features in an SF should be extracted from the subchunks distributed uniformly across the chunk.
+> Can be implemented simply by sorting
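+
+A minimal sketch of this extraction-plus-grouping pipeline (not the authors' code); the choice of 12 features grouped into 4 SFs, the stride-based grouping, and the stand-in hash in place of a real Rabin fingerprint are my assumptions.
+
+```python
+import hashlib
+
+N_FEATURES, N_SF = 12, 4   # assumed: 12 features grouped into 4 super-features
+WINDOW = 48                # assumed sliding-window size for the fingerprint
+
+def _fp(window: bytes) -> int:
+    """Stand-in for a Rabin fingerprint over a small sliding window."""
+    return int.from_bytes(hashlib.blake2b(window, digest_size=8).digest(), "big")
+
+def features(chunk: bytes) -> list:
+    """Split the chunk into N equal-sized subchunks; the feature of a subchunk
+    is the maximum fingerprint value seen while sliding over it."""
+    step = max(1, len(chunk) // N_FEATURES)
+    feats = []
+    for i in range(N_FEATURES):
+        sub = chunk[i * step:(i + 1) * step] if i < N_FEATURES - 1 else chunk[i * step:]
+        feats.append(max(_fp(sub[j:j + WINDOW])
+                         for j in range(max(1, len(sub) - WINDOW + 1))))
+    return feats
+
+def super_features(feats: list) -> list:
+    """Group features taken from subchunks spread uniformly across the chunk
+    (a simple stride here), sort within each group, and hash each group."""
+    groups = [sorted(feats[i::N_SF]) for i in range(N_SF)]
+    return [hashlib.blake2b(str(g).encode(), digest_size=8).hexdigest()
+            for g in groups]
+```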
+
+### Implementation and Evaluation
+Implementation:
+implement delta compression in an open-source deduplication prototype system (Destor)
+delta encoding $\rightarrow$ Xdelta
+
+Evaluation:
+1. Delta Compression Ratio (DCR): overall space saving
+2. Delta Compression Efficiency (DCE): focus on the detected resembling chunks
+3. Similarity Computing Speed
+4. System Throughput
+
+## 2. Strength (Contributions of the paper)
+1. This paper tries to reduce the computation overhead of resemblance detection by using the feature locality in subchunks. It provides sufficient experimental observations to support its idea, which makes it more concrete.
+## 3. Weakness (Limitations of the paper)
+1. One key limitation of this paper is that it can only detect similar chunks of the same size. How to handle two highly similar chunks with different sizes could be considered.
+
+## 4. Future Works
I think this paper is related to how to measure the chunk similarity between two given chunks. However, it still cannot quantify the similarity between two chunks, which limits further optimization of the delta compression. One thing that can be considered is to design a model to quantify the similarity between two given chunks.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Restore-Performance/ALACC-FAST'18.md b/StoragePaperNote/Deduplication/Restore-Performance/ALACC-FAST'18.md
old mode 100644
new mode 100755
index e6b29d7..dffb6d2
--- a/StoragePaperNote/Deduplication/Restore-Performance/ALACC-FAST'18.md
+++ b/StoragePaperNote/Deduplication/Restore-Performance/ALACC-FAST'18.md
@@ -1,99 +1,99 @@
----
-typora-copy-images-to: paper_figure
----
-ALACC: Accelerating Restore Performance of Data Deduplication Systems Using Adaptive Look-Ahead Window Assisted Chunk Caching
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| FAST'18 | containers deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-- This paper focuses on how to accelerate the restore operation in a data deduplication system.
-> Requesting a unique or duplicate data chunk may trigger a container read if the data chunk is not currently available in memory.
-> Due to the **serious data fragmentation** and **size mismatching of requested data and I/O unite**, the restore performance is much lower that that of directly reading out the data which is not deduplicated.
-
-- Existing method: chunk-based caching, container-based caching, and forward assembly.
-Remaining issues:
-> 1. how to handle the change of **workload locality**
-> 2. how to use the look-ahead window to improve the cache hit ratio
-> 3. how to make better trade-off between computing overhead and restore performance?
-
-| | Container-based Caching | Chunk-based Caching | Forward Assembly |
-| ------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------- |
-| Advantage | Less operating and management overhead | High cache hit ratio (can be improved by using look-ahead window) | Highly efficient, Low operating and management overhead |
-| Disadvantage | Relatively higher **cache miss ration**, especially when the caching space is limited. | Higher operating and management overhead | Workload sensitive, requires **good workload locality** |
-
-- Two main research directions in this issue:
-> 1. selecting and storing some duplicated data chunks during the deduplication processes
-> 2. design efficient caching policies during the restore process
-
-### Adaptive Look-Ahead Chunk Caching (ALACC)
-
-- Major goal: reduce the number of container-reads.
-> Forward assembly + chunk-based caching + LAW (limited memory space, look-ahead window)
-
-
-- Main idea:
-1. design a hybrid scheme which combines chunkl-based caching and forward assembly
-2. a new way to make better decision on which data chunks are to be cached or evicated.
-
-
-- Look-ahead window assisted chunk cache
-
-FAA: consists several container size memory buffers
-LAW: covers a range bigger than that of FAA in the recipe.
-> divide the FAW into two portions: FAA Covered Range (size is same as FAA) + Information for Chunk Cache (used for caching policy)
-
-Restore process:
-> 1. check the chunk in container read buffer
-> 2. check the chunk in chunk cache
-> 3. if that chunk is not in the chunk cache, read the container that holds the data chunk from the storage
-> 4. each chunk in this container is checked with LAW to identify all the locations it appears in the FAA.
-> 5. decide which chunks in this container are to be inserted to the chunk cache according to the **a caching policy**
-
-Caching policy:
-1. divide chunks in read-in container into three types:
-> 1. U-chunk (Unused chunk): not appear in the current entire LAW
-> 2. P-chunk (Probably used chunk): appears in the current FAA but does not appear in the second portion of the LAW
-> 3. F-chunk (Future used chunk): chunks will be used in the second portion of the LAW.
-
-2. Caching principle
-F-chunks should be cached, if more cache space is still available, it may cache some P-chunks. (also define the priority)
-
-- The adaptive algorithm
-It proposes an adaptive algorithm that can dynamically adjust the size of FAA, chunk cache and LAW according to the workload variation during the restore process.
-> For example, for backup trace: most data chunks are unique at the beginning. Later, in different sections of the work load, they may have various degrees of deduplication and re-usage.
-> Mainly consider three sizes: LAW size $\rightarrow$ $S_{LAW}$, FAA size $\rightarrow$ $S_{FAA}$, chunk cache size $\rightarrow$ $S_{cache}$. (it should require $S_{FAA} + S_{cache} \leq S_{LAW}$)
-> The size of LAW:
->
-> > Too large: the computing overhead large but the extra information in the LAW is wasting
-> > Too small: it becomes forward assembly + LRU cache.
-
-
-The key goal: to achieve best trade-off between cache efficiency and (CPU and memory) overhead.
-
-> dynamically adjusts the memory space ratio of FAA and chunk cache, and the size of LAW.
-> Instead of using a fixed value as the threshold.
-
-### Implementation and Evaluation
-- Prototype:
-> a deduplication system: 11k LoC C program
-> Restore recovery log (RRL) is maintained to ensure reliability.
-> Not open-source
-
-- Evaluation
-> a Dell PowerEdge server with 2.40GHz Intel Xeon 24 cores and 32GB
-> four deduplication traces: FSL_1, FSL_2, EMC_1, EMC_2
-> Measurement: speed factor (MB/container-read), computing cost factor (second/GB), restore throughput (MB/second).
-
-## 2. Strength (Contributions of the paper)
-1. this paper comprehensively analyzes the trade-off of different restore operation schemes in terms of the performance in various workloads and memory configurations.
-2. propose ALACC which can dynamically adjust the sizes of FAA and chunk cache to adapt to the changing of chunk locality.
-3. implement an effective look-ahead window with its size dynamically adjusted to provide essential information for FAA.
-## 3. Weakness (Limitations of the paper)
-1. I think ALACC can be still improved by multi-threading implementation. Also, this paper extends ALACC to handle the duplicated data chunk rewriting in FAST'19.
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+ALACC: Accelerating Restore Performance of Data Deduplication Systems Using Adaptive Look-Ahead Window Assisted Chunk Caching
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| FAST'18 | containers deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+- This paper focuses on how to accelerate the restore operation in a data deduplication system.
+> Requesting a unique or duplicate data chunk may trigger a container read if the data chunk is not currently available in memory.
+> Due to the **serious data fragmentation** and the **size mismatch between requested data and the I/O unit**, the restore performance is much lower than that of directly reading out data which is not deduplicated.
+
+- Existing methods: chunk-based caching, container-based caching, and forward assembly.
+Remaining issues:
+> 1. how to handle the change of **workload locality**
+> 2. how to use the look-ahead window to improve the cache hit ratio
+> 3. how to make a better trade-off between computing overhead and restore performance?
+
+| | Container-based Caching | Chunk-based Caching | Forward Assembly |
+| ------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------- |
+| Advantage | Less operating and management overhead | High cache hit ratio (can be improved by using look-ahead window) | Highly efficient, Low operating and management overhead |
+| Disadvantage | Relatively higher **cache miss ratio**, especially when the caching space is limited. | Higher operating and management overhead | Workload sensitive, requires **good workload locality** |
+
+- Two main research directions in this issue:
+> 1. selecting and storing some duplicated data chunks during the deduplication processes
+> 2. design efficient caching policies during the restore process
+
+### Adaptive Look-Ahead Chunk Caching (ALACC)
+
+- Major goal: reduce the number of container-reads.
+> Forward assembly + chunk-based caching + LAW (limited memory space, look-ahead window)
+
+
+- Main idea:
+1. design a hybrid scheme which combines chunk-based caching and forward assembly
+2. a new way to make better decisions on which data chunks are to be cached or evicted.
+
+
+- Look-ahead window assisted chunk cache
+
+FAA: consists of several container-size memory buffers
+LAW: covers a range bigger than that of the FAA in the recipe.
+> divide the LAW into two portions: the FAA-covered range (same size as the FAA) + information for the chunk cache (used by the caching policy)
+
+Restore process:
+> 1. check the chunk in container read buffer
+> 2. check the chunk in chunk cache
+> 3. if that chunk is not in the chunk cache, read the container that holds the data chunk from the storage
+> 4. each chunk in this container is checked with LAW to identify all the locations it appears in the FAA.
+> 5. decide which chunks in this container are to be inserted into the chunk cache according to a **caching policy**
+
+Caching policy:
+1. divide chunks in read-in container into three types:
+> 1. U-chunk (Unused chunk): does not appear anywhere in the current LAW
+> 2. P-chunk (Probably used chunk): appears in the current FAA but does not appear in the second portion of the LAW
+> 3. F-chunk (Future used chunk): chunks will be used in the second portion of the LAW.
+
+2. Caching principle
+F-chunks should be cached; if more cache space is still available, some P-chunks may also be cached (priorities are also defined). A sketch of this classification and admission follows below.
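+
+A minimal sketch of this classification and admission step (not the authors' code); counting cache capacity in chunks and ignoring eviction are simplifications of my own.
+
+```python
+def classify(container, faa_chunk_ids, law_second_part_ids):
+    """container: {chunk_id: data} just read from disk.
+    faa_chunk_ids / law_second_part_ids: chunk IDs referenced by the
+    FAA-covered range and by the second portion of the LAW, respectively."""
+    f_chunks, p_chunks, u_chunks = {}, {}, {}
+    for cid, data in container.items():
+        if cid in law_second_part_ids:    # F-chunk: needed beyond the FAA
+            f_chunks[cid] = data
+        elif cid in faa_chunk_ids:        # P-chunk: needed only inside the FAA
+            p_chunks[cid] = data
+        else:                             # U-chunk: not referenced in the LAW
+            u_chunks[cid] = data
+    return f_chunks, p_chunks, u_chunks
+
+def admit(cache, capacity, f_chunks, p_chunks):
+    """Cache all F-chunks first; fill any remaining space with P-chunks."""
+    for source in (f_chunks, p_chunks):
+        for cid, data in source.items():
+            if len(cache) >= capacity:
+                return
+            cache.setdefault(cid, data)
+```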
+
+- The adaptive algorithm
+It proposes an adaptive algorithm that can dynamically adjust the size of FAA, chunk cache and LAW according to the workload variation during the restore process.
+> For example, in a backup trace, most data chunks are unique at the beginning. Later, different sections of the workload may have various degrees of deduplication and re-usage.
+> Mainly consider three sizes: LAW size $\rightarrow$ $S_{LAW}$, FAA size $\rightarrow$ $S_{FAA}$, chunk cache size $\rightarrow$ $S_{cache}$. (it should require $S_{FAA} + S_{cache} \leq S_{LAW}$)
+> The size of LAW:
+>
+> > Too large: the computing overhead is large and the extra information in the LAW is wasted
+> > Too small: it becomes forward assembly + LRU cache.
+
+
+The key goal: to achieve the best trade-off between cache efficiency and (CPU and memory) overhead.
+
+> dynamically adjusts the memory space ratio of FAA and chunk cache, and the size of LAW.
+> Instead of using a fixed value as the threshold.
+
+### Implementation and Evaluation
+- Prototype:
+> a deduplication system: 11k LoC C program
+> Restore recovery log (RRL) is maintained to ensure reliability.
+> Not open-source
+
+- Evaluation
+> a Dell PowerEdge server with 2.40GHz Intel Xeon 24 cores and 32GB
+> four deduplication traces: FSL_1, FSL_2, EMC_1, EMC_2
+> Measurement: speed factor (MB/container-read), computing cost factor (second/GB), restore throughput (MB/second).
+
+## 2. Strength (Contributions of the paper)
+1. this paper comprehensively analyzes the trade-off of different restore operation schemes in terms of the performance in various workloads and memory configurations.
+2. propose ALACC which can dynamically adjust the sizes of FAA and chunk cache to adapt to the changing of chunk locality.
+3. implement an effective look-ahead window with its size dynamically adjusted to provide essential information for FAA.
+## 3. Weakness (Limitations of the paper)
+1. I think ALACC can still be improved by a multi-threading implementation. Also, the authors extend ALACC to handle duplicate data chunk rewriting in FAST'19.
+
+## 4. Future Works
When this issue meets secure deduplication algorithms, does it raise other problems?
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Restore-Performance/ImproveRestore-FAST'13.md b/StoragePaperNote/Deduplication/Restore-Performance/ImproveRestore-FAST'13.md
old mode 100644
new mode 100755
index f050cbc..d3b512d
--- a/StoragePaperNote/Deduplication/Restore-Performance/ImproveRestore-FAST'13.md
+++ b/StoragePaperNote/Deduplication/Restore-Performance/ImproveRestore-FAST'13.md
@@ -1,105 +1,105 @@
----
-typora-copy-images-to: ../paper_figure
----
-Improving Restore Speed for Backup Systems that Use Inline Chunk-Based Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'13 | Deduplication Restore |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Restore speed in such systems often suffers due to **chunk fragmentation**.
-> Modern disks' relatively poor random I/O performance compared to sequential I/O, fragmentation greatly hurts restore performance.
-
-Due to chunk sharing between backups, it is not possible in general to find a chunk layout that reduces the fragmentation.
-> Rearranging chunks can also be expensive
-
-Chunk fragmentation and restore performance gets worse as time goes by, and affects the recent backups the most.
-> slowdowns of $4X$ over three months for one and $11X$ over two years for the other.
-
-This paper investigates
-1. how fragmentation and restore speed behave over time and under different cache size?
-2. two approaches to improve restore speed in these deduplication system.
-
-### Method Name
-- Background
-1. A backup (stream): a sequence of chunks generated by a chunking algorithm.
-2. Chunk container: store chunks, one container per incoming stream.
-3. Open container (4MB): the container that is used for storing new chunks from that stream.
-4. For RAID or recipe reference indirection, it is more efficient to blindly read the entire container even when we need only on chunk.
-
-
-- Simulation
-This paper does the simulation under a default simple n-slot LRU container cache
-> hold n chunk containers at a time and take $n\times$ container size space.
-> **Measurement factor**: the mean number of containers read per $MB$ of backup restored for the backups of a long term data set.
-> Proof: reading containers is the dominant restore cost.
-> 
-
-- Measuring restore speed
-This paper introduces a new restore speed proxy: **speed factor**.
-$\frac{1}{\text{mean containers read per MB of data stored}}$.
-This can estimate of absolute system performance via:
->1. raw I/O performance
->2. the container size
-
-- Container Capping
-Main idea: limit how many containers need to be read at restore time for each section of the backup.
-
-Capping trades off deduplication for faster restore speed
-> 1. In order to use fewer old containers, it has to give up deduplication
-> 2. instead of using a reference to an existing chunk copy in an old container, it will store a duplicate copy of that chunk in an open container and point to that copy.
-
-Step:
-1. Divide the backup stream into several segments (default 20MB)
-2. The capping requires an extra segment-sized buffer in RAM for each stream being ingested.
-3. Choose a threshold T based on the information about which containers contain which chunks of the segment.
-4. Rank order the containers by how many chunks of the segment they contain, breaking ties in favor of more recent containers, and choose the top T containers which contain the most chunks.
-5. Append any "new" chunks to the open containers.
-
-This process guarantees that the recipe for each segments refers to at most T old containers plus a little number of new containers containing "new" chunks.
-
-- Forward Assembly Area
-Main Idea: future knowledge of accessses can be exploited to improve both caching and prefetching.
-
-In deduplicated backup streams, two points different virtual memory paging:
->1. the effective unit of I/O is an entire chunk container (4MB), whereas the unit of the use is a much smaller variable-size chunk.
->2. at the time of starting the restore: it can have the prefect knowledge of the exact sequency of chunks that will be used thanks to the backup's recipe.
-
-Main step:
-Page in chunk containers to a single buffer but **cache chunks** rather than containers to avoid keeping around chunks that will never be used.
-> consult the next part of recipe to make better decisions about what chunks from the paged-in containers to retain.
-
-Goal: need load each container only **once** per range (M-byte slice) and do not keep around chunks that will not be needed during this range
-
-This method can also combine with ring buffer to further improve the efficiency of memory usage.
-
-
-### Implementation and Evaluation
-- Implementation
-Modified one of them deduplication simulators, and apply it two primary data sets under various caching strategies, and the effects of capping and container size.
->1. deduplication performance
->2. fragmentation
->3. restore performance
-
-Details: 9000 C++ program
-1. full chunk index: map the hashes of stored chunks to the chunk container
-2. chunk container size: 4MB
-3. mainly focus on deduplication ratio and speed factor
-
-## 2. Strength (Contributions of the paper)
-1. This paper shows that it is possible to give up a relatively small percentage of deduplication in practice and get quite substantial speedups.
-2. By using capping, it truly can reduce fragmentation.
-## 3. Weakness (Limitations of the paper)
-1. Given a workload, maybe it is hard to estimate the degree of trade-off between speedup factors and deduplication ratio.
-
-## 4. Future Works
-1. the idea of container capping is to achieve substantial speedups while giving up only a small amount of deduplication.
-2. This paper also mentions that for deduplication restore, it should use all available RAM for restoring a system for a single large forward assembly area and associated buffers.
-> Unless deduplication is at a great premium, at least a small amount of capping should be employed.
-
-3. Adaptiving capping
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Improving Restore Speed for Backup Systems that Use Inline Chunk-Based Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'13 | Deduplication Restore |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Restore speed in such systems often suffers due to **chunk fragmentation**.
+> Due to modern disks' relatively poor random I/O performance compared to sequential I/O, fragmentation greatly hurts restore performance.
+
+Due to chunk sharing between backups, it is not possible in general to find a chunk layout that reduces the fragmentation.
+> Rearranging chunks can also be expensive
+
+Chunk fragmentation gets worse as time goes by, restore performance degrades with it, and the most recent backups are affected the most.
+> slowdowns of $4X$ over three months for one and $11X$ over two years for the other.
+
+This paper investigates
+1. how fragmentation and restore speed behave over time and under different cache sizes
+2. two approaches to improve restore speed in such deduplication systems.
+
+### Container Capping and Forward Assembly Area
+- Background
+1. A backup (stream): a sequence of chunks generated by a chunking algorithm.
+2. Chunk container: store chunks, one container per incoming stream.
+3. Open container (4MB): the container that is used for storing new chunks from that stream.
+4. With RAID or recipe reference indirection, it is more efficient to blindly read the entire container even when we need only one chunk.
+
+
+- Simulation
+This paper does the simulation under a default simple n-slot LRU container cache
+> hold n chunk containers at a time and take $n\times$ container size space.
+> **Measurement factor**: the mean number of containers read per $MB$ of backup restored for the backups of a long term data set.
+> Rationale: reading containers is the dominant restore cost.
+> 
+
+- Measuring restore speed
+This paper introduces a new restore speed proxy: **speed factor**.
+$\frac{1}{\text{mean containers read per MB of data restored}}$.
+This can be converted into an estimate of absolute system performance given:
+>1. raw I/O performance
+>2. the container size
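+
+As a back-of-the-envelope illustration of this conversion (my own numbers, not the paper's), assuming container reads dominate restore time:
+
+```python
+speed_factor = 2.0       # MB restored per container read (assumed value)
+t_container_read = 0.05  # seconds per 4MB container read (assumed: seek + sequential read)
+restore_throughput = speed_factor / t_container_read   # = 40 MB/s
+```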
+
+- Container Capping
+Main idea: limit how many containers need to be read at restore time for each section of the backup.
+
+Capping trades off deduplication for faster restore speed
+> 1. In order to use fewer old containers, it has to give up deduplication
+> 2. instead of using a reference to an existing chunk copy in an old container, it will store a duplicate copy of that chunk in an open container and point to that copy.
+
+Step:
+1. Divide the backup stream into several segments (default 20MB)
+2. The capping requires an extra segment-sized buffer in RAM for each stream being ingested.
+3. Choose a threshold T based on the information about which containers contain which chunks of the segment.
+4. Rank order the containers by how many chunks of the segment they contain, breaking ties in favor of more recent containers, and choose the top T containers which contain the most chunks.
+5. Append any "new" chunks to the open containers.
+
+This process guarantees that each segment's recipe refers to at most T old containers plus a small number of new containers containing "new" chunks.
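+
+A minimal sketch of this per-segment selection (not the authors' code); the (chunk_hash, old_container_id) representation and "more recent = higher container ID" are my assumptions.
+
+```python
+from collections import Counter
+
+def cap_segment(segment, T):
+    """segment: list of (chunk_hash, old_container_id or None) pairs."""
+    counts = Counter(cid for _, cid in segment if cid is not None)
+    # Rank old containers by how many of the segment's chunks they hold,
+    # breaking ties in favor of more recent (higher-ID) containers.
+    top = {cid for cid, _ in sorted(counts.items(),
+                                    key=lambda kv: (kv[1], kv[0]),
+                                    reverse=True)[:T]}
+    plan = []
+    for h, cid in segment:
+        if cid is None:
+            plan.append((h, "new"))      # append to an open container
+        elif cid in top:
+            plan.append((h, "dedup"))    # keep the reference to the old container
+        else:
+            plan.append((h, "rewrite"))  # store a duplicate copy instead
+    return plan
+```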
+
+- Forward Assembly Area
+Main Idea: future knowledge of accesses can be exploited to improve both caching and prefetching.
+
+In deduplicated backup streams, two points differ from virtual memory paging:
+>1. the effective unit of I/O is an entire chunk container (4MB), whereas the unit of use is a much smaller variable-size chunk.
+>2. at the time of starting the restore, it can have perfect knowledge of the exact sequence of chunks that will be used, thanks to the backup's recipe.
+
+Main step:
+Page in chunk containers to a single buffer but **cache chunks** rather than containers to avoid keeping around chunks that will never be used.
+> consult the next part of the recipe to make better decisions about which chunks from the paged-in containers to retain.
+
+Goal: load each container only **once** per range (M-byte slice) and do not keep around chunks that will not be needed during this range.
+
+This method can also be combined with a ring buffer to further improve the efficiency of memory usage.
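+
+A minimal sketch of filling one M-byte slice (not the authors' code); the recipe tuple layout and the `read_container` helper are assumptions for illustration.
+
+```python
+def assemble_slice(recipe_slice, read_container, slice_size):
+    """recipe_slice: list of (offset_in_slice, length, container_id, chunk_id).
+    read_container(cid) -> {chunk_id: bytes}."""
+    faa = bytearray(slice_size)
+    uses_by_container = {}
+    for off, length, cid, chunk_id in recipe_slice:
+        uses_by_container.setdefault(cid, []).append((off, length, chunk_id))
+    for cid, uses in uses_by_container.items():
+        chunks = read_container(cid)      # each container is read exactly once
+        for off, length, chunk_id in uses:
+            faa[off:off + length] = chunks[chunk_id][:length]
+        # nothing from this container is retained beyond this point
+    return bytes(faa)
+```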
+
+
+### Implementation and Evaluation
+- Implementation
+Modified one of their deduplication simulators and applied it to two primary data sets, studying various caching strategies and the effects of capping and container size on:
+>1. deduplication performance
+>2. fragmentation
+>3. restore performance
+
+Details: a 9000-line C++ program
+1. full chunk index: map the hashes of stored chunks to the chunk container
+2. chunk container size: 4MB
+3. mainly focus on deduplication ratio and speed factor
+
+## 2. Strength (Contributions of the paper)
+1. This paper shows that it is possible to give up a relatively small percentage of deduplication in practice and get quite substantial speedups.
+2. By using capping, it truly can reduce fragmentation.
+## 3. Weakness (Limitations of the paper)
+1. Given a workload, it may be hard to estimate the degree of trade-off between the speed factor and the deduplication ratio.
+
+## 4. Future Works
+1. the idea of container capping is to achieve substantial speedups while giving up only a small amount of deduplication.
+2. This paper also mentions that, for deduplication restore, the system should use all available RAM for a single large forward assembly area and associated buffers.
+> Unless deduplication is at a great premium, at least a small amount of capping should be employed.
+
+3. Adaptive capping
+
4. Given a workload, how to measure the fragmentation level at different scales?
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Restore-Performance/LookBackWindow-FAST'19.md b/StoragePaperNote/Deduplication/Restore-Performance/LookBackWindow-FAST'19.md
old mode 100644
new mode 100755
index 6e77f5b..37cf6ba
--- a/StoragePaperNote/Deduplication/Restore-Performance/LookBackWindow-FAST'19.md
+++ b/StoragePaperNote/Deduplication/Restore-Performance/LookBackWindow-FAST'19.md
@@ -1,179 +1,179 @@
----
-typora-copy-images-to: ../paper_figure
----
-Sliding Look-Back Window Assisted Data Chunk Rewriting for Improving Deduplication Restore Performance
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'19 | Deduplication Restore |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-The data generated by deduplication is stored in data chunks or data containers, but the restore process is rather slow due to
-> data fragmentation: data chunks are scattered
-> read amplification: the size of data being read is larger than the size of data being restored
-
-This paper focuses on reducing the data fragmentation and read amplification of container-based deduplication system.
-> improve data chunk locality
-> make a better trade-off between the deduplication ratio and the number of required container reads (compared with capping-FAST'13)
-> reducing the number of container reads is a major task for restore performance improvement.
-
-
-- Container-based deduplication system
-Since the size of a data chunk is rather small
-> storing individual data chunk directly cannot efficiently utilize the storage bandwidth for storage systems with low random read/write performance. (HDD)
-> typically accumulates a number of data chunks in a *container* before writing them out together.
-
-
-- The limitations of capping (FAST'13)
- - analyzes each segment in isolation without considering the contents of the previous or following segments.
- - the deduplication ration cannot be guaranteed via capping (not consider deduplication ratio as an optimization objective in capping scheme)
- - apply the same cap to each segment
-
-
-- Main challenge
-it is difficult to make decisions on which duplicate chunks need to be rewritten with the minimum reduction of the deduplication ratio.
-
-### Sliding Look-back window
-- Data rewrite
-The decision to rewrite a duplicate data chunk has to be made during the deduplication process instead of being done at restore process like **caching**.
-> avoiding the need to read these duplicate chunks from other old containers.
-
-- Flexible container referenced count based design (FCRC)
-
-1. Main idea: apply varied capping levels for different segments
-> further reduce container reads
-> achieve even fewer data chunk rewrites
-
-2. how to decide the "capping level" for different segments?
-Instead of using a fixed capping level as the selection threshold, it uses a value of CNRC (*the number of data chunks in the segment that belongs to an old container*).
-> rewrite duplicate data chunks from old containers that have CNRCs lower than the threshold.
-
-The actual capping level is decided by
-> the threshold $T_{cnrc}$
-> the distribution of CNRCs of these segments
-
-Two bounds for $T_{cnrc}$
-1. bound for deduplication ratio reduction
-a targeted deduplication ratio reduction limit $x%$
-> bound the number of rewrite chunks
-
-
-2. bound for container reads:
-a targeted number of container reads $Cap$ in one segment
-
-Use those two bounds to determine the $T_{cnrc}$.
-
-- Sliding look-back window (LWB)
-In both the capping and FCRC schemes, the decision to rewrite duplicate data chunks near the segment partition boundaries may have issues.
-> existing wasted rewrite chunks (since restore cache)
-
-The rewrite decisions made with the statistics only from the current segment are less accurate.
-
-
-The LBW acts as a recipe cache that maintains the metadata entries of data chunks in the order covered by the LBW in the byte stream.
-> metadata entry:
-> 1. chunk metadata
-> 2. offset in the byte stream
-> 3. container ID/address
-> 4. the offset in the container
-
-With both **past** and **future** information in the LBW, a more accurate decision can be made.
-
-- Rewrite selection policy for LWB
- - use **two** criteria of restore process to adjust threshold and make more accurate rewrite decisions.
- - cache-effective range: how to maintain LBW with the size compatible with the cache-effective range
- - container read efficiency
-
-The whole process of rewrite selection:
-1. Step 1 (move LBW)
-When LBW moves forward for one container size
-> a container size of data chunks (added container) will be added to the front of the LBW
-> one container size of data chunks (evicted container) will be removed from the end of the LBW
-
-2. Step 2 (process the added container)
-classify the data chunks into three categories:
-> unique chunks
-> non-rewrite chunks (duplicate data chunks that will not be rewritten)
-> candidate chunks (duplicate data chunks that may be rewritten)
-
-3. Step 3 (update metadata entries)
-update the metadata entries of data chunks in the added container
-> identify candidate chunks and write them to rewrite candidate cache
-
-4. Step 4 (recalculate $CNRC_{lbw}$)
-recalculate the $CNRC_{lbw}$ of old containers that contain the data chunks in the rewrite candidate cache
-> reclassify these data chunks
-
-5. Step 5 (rewrite)
-rewrite the remaining candidate chunks and update metadata entries, write to the recipe persistently
-
-6. Step 6 (adjust)
-To make better trade-off between deduplication ratio reduction and the number of container reads
-> adjust at the end of each cycle
-
-
-### Implementation and Evaluation
-- Evaluation
-Speed factor: MB/container-read, the mean size data being restored (MB) per container read to indicate the amount of data that can be restored by one container read on average.
-> the container I/O time dominates the whole restore time (in HDD)
-
-Trace: FSL
-select six types of traces in FSL
-
-> each trace contains 10 full backup snapshots
-
-1. Baseline method
-For rewrite:
-> normal deduplication with no rewrite
-> capping scheme (Capping)
-> flexible container referenced count based scheme (FCRC)
-> sliding look-back windows scheme (LBW)
-
-For restore cache:
-> forward assembly area (FAA)
-> adaptive look-ahead window chunk based caching (ALACC)
-
-
-2. Metrics
-> Deduplication ratio vs. Speed factor
-> Restore performance
-
-## 2. Strength (Contributions of the paper)
-- rewrite scheme
- - a flexible container-based referenced count based rewrite scheme
- - sliding look-back window based design
-
-- Good experiment
- - compare with many state-of-art schemes
-
-
-## 3. Weakness (Limitations of the paper)
-- Does not show a complete system with proposed scheme
- - the real performance of LBW is not clear
-
-
-
-## 4. Future Works
-1. Use container or not use container?
-- not use container
-This paper mentions that some deduplication systems *directly store individual data chunks to the persistent storage*.
-> HYDRAstor, iDedup, Dmdedup and ZFS
-
-- use container
-some backup products pack a number of data chunks (compression may be applied) in one I/O unit, and data in the same container are written out and read in together.
-> Veritas and Data Domain
-> benefit from the high sequential I/O performance and good chunk locality.
-
-
-2. Adaptive to different workload characteristics
-One start-point of this paper is the performance of previous schemes is closely related to the workload characteristics.
-> Insight: the motivation to propose an adaptive method
-
-3. Future work
-how to design a scheme to adaptively change LBW size
-> more interlligent rewrite policies
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Sliding Look-Back Window Assisted Data Chunk Rewriting for Improving Deduplication Restore Performance
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'19 | Deduplication Restore |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+The data generated by deduplication is stored in data chunks or data containers, but the restore process is rather slow due to
+> data fragmentation: data chunks are scattered
+> read amplification: the size of data being read is larger than the size of data being restored
+
+This paper focuses on reducing the data fragmentation and read amplification of container-based deduplication systems.
+> improve data chunk locality
+> make a better trade-off between the deduplication ratio and the number of required container reads (compared with capping-FAST'13)
+> reducing the number of container reads is a major task for restore performance improvement.
+
+
+- Container-based deduplication system
+Since the size of a data chunk is rather small
+> storing individual data chunks directly cannot efficiently utilize the storage bandwidth of storage systems with low random read/write performance (HDDs)
+> such systems typically accumulate a number of data chunks in a *container* before writing them out together.
+
+
+- The limitations of capping (FAST'13)
+ - analyzes each segment in isolation without considering the contents of the previous or following segments.
+ - the deduplication ratio cannot be guaranteed via capping (the capping scheme does not consider the deduplication ratio as an optimization objective)
+ - applies the same cap to every segment
+
+
+- Main challenge
+it is difficult to make decisions on which duplicate chunks need to be rewritten with the minimum reduction of the deduplication ratio.
+
+### Sliding Look-back window
+- Data rewrite
+The decision to rewrite a duplicate data chunk has to be made during the deduplication process, rather than during the restore process as **caching** does.
+> avoiding the need to read these duplicate chunks from other old containers.
+
+- Flexible container referenced count based design (FCRC)
+
+1. Main idea: apply varied capping levels for different segments
+> further reduce container reads
+> achieve even fewer data chunk rewrites
+
+2. how to decide the "capping level" for different segments?
+Instead of using a fixed capping level as the selection threshold, it uses a CNRC value (*the number of data chunks in the segment that belong to an old container*).
+> rewrite duplicate data chunks from old containers that have CNRCs lower than the threshold.
+
+The actual capping level is decided by
+> the threshold $T_{cnrc}$
+> the distribution of CNRCs of these segments
+
+Two bounds for $T_{cnrc}$
+1. bound for deduplication ratio reduction
+a targeted deduplication ratio reduction limit $x\%$
+> bound the number of rewrite chunks
+
+
+2. bound for container reads:
+a targeted number of container reads $Cap$ in one segment
+
+Use those two bounds to determine the $T_{cnrc}$.
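+
+A minimal sketch of deriving $T_{cnrc}$ from the two bounds (not the authors' algorithm); the notes do not spell out how the bounds are combined, so the conservative `min` at the end is my assumption.
+
+```python
+def pick_t_cnrc(cnrcs, max_rewritten_chunks, cap):
+    """cnrcs: CNRC of each old container referenced by the segment (non-empty).
+    Chunks of containers whose CNRC falls below the threshold are rewritten."""
+    candidates = sorted(set(cnrcs)) + [max(cnrcs) + 1]
+    # Bound 1: largest threshold whose rewrites stay within the dedup budget.
+    t_dedup = max((t for t in candidates
+                   if sum(c for c in cnrcs if c < t) <= max_rewritten_chunks),
+                  default=0)
+    # Bound 2: smallest threshold that keeps old-container reads within Cap.
+    t_reads = min((t for t in candidates
+                   if sum(1 for c in cnrcs if c >= t) <= cap),
+                  default=candidates[-1])
+    return min(t_dedup, t_reads)
+```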
+
+- Sliding look-back window (LBW)
+In both the capping and FCRC schemes, the decision to rewrite duplicate data chunks near the segment partition boundaries may have issues.
+> some rewritten chunks are wasted (the restore cache would have covered them anyway)
+
+The rewrite decisions made with the statistics only from the current segment are less accurate.
+
+
+The LBW acts as a recipe cache that maintains the metadata entries of data chunks in the order covered by the LBW in the byte stream.
+> metadata entry:
+> 1. chunk metadata
+> 2. offset in the byte stream
+> 3. container ID/address
+> 4. the offset in the container
+
+With both **past** and **future** information in the LBW, a more accurate decision can be made.
+
+- Rewrite selection policy for LBW
+ - use **two** criteria of restore process to adjust threshold and make more accurate rewrite decisions.
+ - cache-effective range: how to maintain LBW with the size compatible with the cache-effective range
+ - container read efficiency
+
+The whole process of rewrite selection (a condensed sketch follows after step 6):
+1. Step 1 (move LBW)
+When the LBW moves forward by one container size
+> a container size of data chunks (added container) will be added to the front of the LBW
+> one container size of data chunks (evicted container) will be removed from the end of the LBW
+
+2. Step 2 (process the added container)
+classify the data chunks into three categories:
+> unique chunks
+> non-rewrite chunks (duplicate data chunks that will not be rewritten)
+> candidate chunks (duplicate data chunks that may be rewritten)
+
+3. Step 3 (update metadata entries)
+update the metadata entries of data chunks in the added container
+> identify candidate chunks and write them to rewrite candidate cache
+
+4. Step 4 (recalculate $CNRC_{lbw}$)
+recalculate the $CNRC_{lbw}$ of old containers that contain the data chunks in the rewrite candidate cache
+> reclassify these data chunks
+
+5. Step 5 (rewrite)
+rewrite the remaining candidate chunks and update metadata entries, write to the recipe persistently
+
+6. Step 6 (adjust)
+To make a better trade-off between deduplication ratio reduction and the number of container reads
+> adjust at the end of each cycle
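+
+A condensed sketch of one sliding cycle, covering steps 1-5 above (not the authors' code); the per-chunk metadata layout and the rewrite test against $T_{cnrc}$ follow my reading of these notes, and decrementing CNRC for evicted references is omitted.
+
+```python
+def slide_one_cycle(lbw, added_container, cand_cache, cnrc_lbw, t_cnrc,
+                    rewrite_chunk, persist_recipe):
+    """lbw: list of per-container metadata lists, oldest first. Each chunk
+    entry is a dict with at least the keys 'unique' and 'old_container'."""
+    # Step 1: move the LBW forward by one container size.
+    lbw.append(added_container)
+    evicted = lbw.pop(0)
+    # Steps 2-3: classify chunks of the added container and update CNRC_lbw;
+    # duplicates from weakly referenced old containers become candidates.
+    for entry in added_container:
+        if entry["unique"]:
+            continue
+        old = entry["old_container"]
+        cnrc_lbw[old] = cnrc_lbw.get(old, 0) + 1
+        if cnrc_lbw[old] < t_cnrc:
+            cand_cache.append(entry)
+    # Step 4: re-check cached candidates against the updated counts.
+    cand_cache[:] = [e for e in cand_cache
+                     if cnrc_lbw[e["old_container"]] < t_cnrc]
+    # Step 5: candidates leaving the window are rewritten; the evicted
+    # container's recipe entries are persisted.
+    for entry in list(cand_cache):
+        if entry in evicted:
+            rewrite_chunk(entry)
+            cand_cache.remove(entry)
+    persist_recipe(evicted)
+    # Step 6 (end-of-cycle threshold adjustment) is not shown.
+```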
+
+
+### Implementation and Evaluation
+- Evaluation
+Speed factor: MB/container-read, i.e., the mean amount of data (MB) restored per container read; it indicates how much data one container read restores on average.
+> the container I/O time dominates the whole restore time (in HDD)
+
+Trace: FSL
+select six types of traces in FSL
+
+> each trace contains 10 full backup snapshots
+
+1. Baseline method
+For rewrite:
+> normal deduplication with no rewrite
+> capping scheme (Capping)
+> flexible container referenced count based scheme (FCRC)
+> sliding look-back windows scheme (LBW)
+
+For restore cache:
+> forward assembly area (FAA)
+> adaptive look-ahead window chunk based caching (ALACC)
+
+
+2. Metrics
+> Deduplication ratio vs. Speed factor
+> Restore performance
+
+## 2. Strength (Contributions of the paper)
+- rewrite scheme
+ - a flexible container-based referenced count based rewrite scheme
+ - sliding look-back window based design
+
+- Good experiment
+ - compare with many state-of-the-art schemes
+
+
+## 3. Weakness (Limitations of the paper)
+- Does not show a complete system with the proposed scheme
+ - the real performance of LBW is not clear
+
+
+
+## 4. Future Works
+1. Use container or not use container?
+- not use container
+This paper mentions that some deduplication systems *directly store individual data chunks to the persistent storage*.
+> HYDRAstor, iDedup, Dmdedup and ZFS
+
+- use container
+some backup products pack a number of data chunks (compression may be applied) in one I/O unit, and data in the same container are written out and read in together.
+> Veritas and Data Domain
+> benefit from the high sequential I/O performance and good chunk locality.
+
+
+2. Adaptive to different workload characteristics
+One starting point of this paper is that the performance of previous schemes is closely related to workload characteristics.
+> Insight: the motivation to propose an adaptive method
+
+3. Future work
+how to design a scheme to adaptively change the LBW size
+> more intelligent rewrite policies
+
how to combine garbage collection with the rewrite design
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Restore-Performance/RevDedup-APSys'13.md b/StoragePaperNote/Deduplication/Restore-Performance/RevDedup-APSys'13.md
old mode 100644
new mode 100755
index 036bf2e..7e81a51
--- a/StoragePaperNote/Deduplication/Restore-Performance/RevDedup-APSys'13.md
+++ b/StoragePaperNote/Deduplication/Restore-Performance/RevDedup-APSys'13.md
@@ -1,95 +1,95 @@
----
-typora-copy-images-to: ../paper_figure
----
-RevDedup: A Reverse Deduplication Storage System Optimized for Reads to Latest Backups
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ACM APSys'13 | Deduplication Restore |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Deduplication introduces fragmentation that degrades read performance. This paper proposes RevDedup which can optimize reads to the **latest** backups of VM images via reverse deduplication.
-> Idea: shift fragmentation to old data while keeping the layout of new data as sequential as possible.
-
-Some assumptions
-1. Today's backup solutions mainly build on disk-based storage.
-2. A fast restore operation can minimize the system downtime during disaster recovery.
-3. Users are more likely to access more recent data.
-
-### RevDedup
-
-- Key insight
-Traditional deduplication systems check if new blocks can be represented by any already stored blocks with identical contents.
-> The fragmentation problem of the latest backup is the most severe since its blocks are scattered across all the prior backups.
-
-To mitigate this impact, this paper proposes to do the **opposite** deduplication.
-> check if any already stored blocks can be represented by the new blocks to be written.
-> keep the storage layout of the newer backups as **sequential** as possible.
-
-
-- System Design
-1. Client-sever model:
-> 1. a server stores deduplicated VM images and the deduplication metadata
-> 2. multiple clients run the active VMs
-
-2. Fix-size chunking
-> 1. have smaller chunking overhead than variable-size chunking.
-
-3. Coarse-grained global deduplication
-> 1. apply deduplication to large fixed-size units called segmemts, (several megabytes)
-> 2. inside single versions VM $\rightarrow$ across different versions VM$\rightarrow$ across different versions VM in different machines.
-> 3. large segment size can amortize disk seek while still achieves high deduplication efficiency.
-
-- Fine-Grained Reverse Deduplication
-1. Remove the any duplicates in older versions, and use indirection.
-
-2. Only compare the two most recent versions $VM_i$ and $VM_{i-1}$ (may loss some saving space)
-3. Reference Counting:
-indicates the number of direct references that currently refer to the block among all versions of the same VM or different VMs.
-
-
-4. Some optimized methods
-Block punching: hole-punched region, and implement via **fallocate()** in file system.
-Segment compaction: compact a segment that excludes the removed blocks.
-> uses a pre-defined threshold (namely, **rebuild threshold**) to determine how to determine how to rebuild a segment excluding removed blocks.
-> rebuild threshold is configured to trade between **disk fragmentation** and **segment copying time**.
-
-
-5. Indexing
-Segment index is in-memory (fingerprint and other metadata ---- each segment)
-> argue that the index has low memory usage when using large-size segments.
-> SHA-1 for block fingerprints and segment fingerprints
-
-The metadata of each block in each segment is stored in disk.
-> fingerprints and reference counts of each block
-
-
-
-### Implementation and Evaluation
-- Imlementation
-Offload the server by having the clients be responsible for both segment and block fingerprint computations.
-> Multi-threading
-> Communication: RESTful APIs
-
-
-
-- Evaluation
-Prove: high deduplication efficiency, high backup throughput, high read throughput.
-Dataset: local VM datasets (160 VM images)
-1. Storage efficiency
-Similar to conventional deduplication ratio (96%)
-2. Backup throughput
-Conventional deduplication backup throughput (30% loss)
-3. Read throughput
-achieve the design goal, improve the restore performance of latest backup.
-
-## 2. Strength (Contributions of the paper)
-1. This paper is very clear, and its idea is very simple but novel, and can improve the restore performance for latest backup.
-2. The system design is very comprehansive.
-## 3. Weakness (Limitations of the paper)
-1. This paper just focuses on latest backups which is not very general.
-## 4. Future Works
-1. This paper just assumes users are more likely to access more recent data, if we weak this assumption to a general case, how to change this method to improve the shift the fragmentation evenly?
+---
+typora-copy-images-to: ../paper_figure
+---
+RevDedup: A Reverse Deduplication Storage System Optimized for Reads to Latest Backups
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ACM APSys'13 | Deduplication Restore |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Deduplication introduces fragmentation that degrades read performance. This paper proposes RevDedup which can optimize reads to the **latest** backups of VM images via reverse deduplication.
+> Idea: shift fragmentation to old data while keeping the layout of new data as sequential as possible.
+
+Some assumptions
+1. Today's backup solutions mainly build on disk-based storage.
+2. A fast restore operation can minimize the system downtime during disaster recovery.
+3. Users are more likely to access more recent data.
+
+### RevDedup
+
+- Key insight
+Traditional deduplication systems check if new blocks can be represented by any already stored blocks with identical contents.
+> The fragmentation problem of the latest backup is the most severe since its blocks are scattered across all the prior backups.
+
+To mitigate this impact, this paper proposes to do the **opposite** deduplication.
+> check if any already stored blocks can be represented by the new blocks to be written.
+> keep the storage layout of the newer backups as **sequential** as possible.
+
+
+- System Design
+1. Client-server model:
+> 1. a server stores deduplicated VM images and the deduplication metadata
+> 2. multiple clients run the active VMs
+
+2. Fixed-size chunking
+> 1. has smaller chunking overhead than variable-size chunking.
+
+3. Coarse-grained global deduplication
+> 1. apply deduplication to large fixed-size units called segments (several megabytes)
+> 2. within a single VM version $\rightarrow$ across different versions of the same VM $\rightarrow$ across VM versions on different machines.
+> 3. a large segment size can amortize disk seeks while still achieving high deduplication efficiency.
+
+- Fine-Grained Reverse Deduplication
+1. Remove any duplicates in older versions, and use indirection (see the sketch after this list).
+
+2. Only compare the two most recent versions $VM_i$ and $VM_{i-1}$ (may lose some space savings)
+3. Reference Counting:
+indicates the number of direct references that currently refer to the block among all versions of the same VM or different VMs.
+
+
+4. Some optimized methods
+Block punching: hole-punch the freed region, implemented via **fallocate()** in the file system.
+Segment compaction: compact a segment that excludes the removed blocks.
+> uses a pre-defined threshold (namely, the **rebuild threshold**) to determine how to rebuild a segment excluding removed blocks.
+> rebuild threshold is configured to trade between **disk fragmentation** and **segment copying time**.
+
+
+5. Indexing
+The segment index is kept in memory (a fingerprint plus other metadata per segment)
+> argue that the index has low memory usage when using large-size segments.
+> SHA-1 for block fingerprints and segment fingerprints
+
+The metadata of each block in each segment is stored on disk.
+> fingerprints and reference counts of each block
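+
+As referenced in the block-punching note above, here is a minimal sketch of punching a hole over a removed block, assuming Linux/glibc and a file system that supports hole punching (e.g., ext4, XFS); the file name and block size are illustrative.
+
+```python
+# Sketch of block punching via fallocate(2) with FALLOC_FL_PUNCH_HOLE.
+# Constants are from <linux/falloc.h>; requires a hole-punching file system.
+import ctypes, os
+
+FALLOC_FL_KEEP_SIZE = 0x01
+FALLOC_FL_PUNCH_HOLE = 0x02
+
+libc = ctypes.CDLL("libc.so.6", use_errno=True)
+
+def punch_hole(path, offset, length):
+    """Free the on-disk space of a removed block without shrinking the file."""
+    fd = os.open(path, os.O_RDWR)
+    try:
+        ret = libc.fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                             ctypes.c_long(offset), ctypes.c_long(length))
+        if ret != 0:
+            err = ctypes.get_errno()
+            raise OSError(err, os.strerror(err))
+    finally:
+        os.close(fd)
+
+# e.g., punch_hole("segment_0001", 4 * 4096, 4096)  # drop the 5th 4KB block
+```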
+
+
+
+### Implementation and Evaluation
+- Implementation
+Offload the server by having the clients be responsible for both segment and block fingerprint computations.
+> Multi-threading
+> Communication: RESTful APIs
+
+
+
+- Evaluation
+Goals to show: high deduplication efficiency, high backup throughput, high read throughput.
+Dataset: local VM datasets (160 VM images)
+1. Storage efficiency
+Comparable to the conventional deduplication ratio (96%)
+2. Backup throughput
+About 30% lower than conventional deduplication backup throughput
+3. Read throughput
+Achieves the design goal: improved restore performance for the latest backup.
+
+## 2. Strength (Contributions of the paper)
+1. This paper is very clear; its idea is simple but novel, and it improves the restore performance of the latest backup.
+2. The system design is very comprehensive.
+## 3. Weakness (Limitations of the paper)
+1. This paper focuses only on the latest backups, which limits its generality.
+## 4. Future Works
+1. This paper assumes users are more likely to access more recent data; if we weaken this assumption to a more general access pattern, how should the method change so that fragmentation is spread evenly?
2. Also, how would the design change with variable-size chunking?
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/BloomFilterDedup-ICSP'19.md b/StoragePaperNote/Deduplication/Secure-Dedup/BloomFilterDedup-ICSP'19.md
old mode 100644
new mode 100755
index 5db7c91..797fd3c
--- a/StoragePaperNote/Deduplication/Secure-Dedup/BloomFilterDedup-ICSP'19.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/BloomFilterDedup-ICSP'19.md
@@ -1,75 +1,75 @@
----
-typora-copy-images-to: ../paper_figure
----
-Bloom Filter Based Privacy Preserving Deduplication System
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| Spinger ICSP'19 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Deduplication can be used as a side channel to learn the existence of a particular data in cloud storage .
-> the existing solutions delay deduplication process to hide information regarding the presence of data.
-> increase the communication overhead, even if the data is present on storage.
-
-This paper wants to solve this problem via using a bloom filter.
-> a client sends an upload request, the server responds by sending a genuine bloom filter (if data exists) along with some dummy filters.
-
-### Bloom Filter privacy perserving deduplication approach
-- In this approach, it uses bloom filter as the identity of the file
-BF is stored at servers as the identity of file,
-> 1. the client who possesses the file, can generate the BF using file blocks and request for access to the file using BF.
-> 2. the client sends tag of file, then, the server responds with set of BFs and encryption keys coresponding to the tag.
-
-- Threat Model
-1. Semi-trusted Server
-Honestly performs the storage operations requested by clients, but is interested in learning plaintext of the outsourced files. (dictionary attacks)
-2. Malicious Client
-misuse deduplication process to learn the presence of file at storage.
-
-
-- Security Goal
-1. Privacy
-An adversary should not be able to learn any information regarding the file.
-> the plaintext information, the presence of file.
-> Server responds with a set of BFs instead of "Yes/No" as a response.
-
-An adversary cannot learn the existence information without knowledge of the complete file.
-
-2. Integrity
-link hash value with stored/sent content.
-
-
-- Proposed Approach
-
-
-- Eavesdropping
-To protect the communication from client to server, it leverages random nonce phenomena ($R$)
-> sends $BF \oplus R$ instead of $BF$
-
-
-### Implementation and Evaluation
-- Evaluation
-1. Upload time
-2. Communication cost
-
-
-## 2. Strength (Contributions of the paper)
-1. This paper uses a bloom filter to achieve privacy preserving deduplication and also provide the security analysis to prove that it can make sense.
-> also has the implementation to show the performance.
-
-
-## 3. Weakness (Limitations of the paper)
-1. This method does not totally solve this issue, since if a malicious user owns a file, it can test whether other people own this file as well.
-> still exists the information leakage.
-
-
-## 4. Future Works
-1. In this paper, it declears that it only considers the client side, file level, inter user deduplication. (When we consider a problem, we should state the scenario condition clearly.)
->1. client side deduplication or server side deduplication
->2. file level or chunk level
->3. inter user deduplication or intra user deduplication
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Bloom Filter Based Privacy Preserving Deduplication System
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| Springer ICSP'19 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Deduplication can be used as a side channel to learn the existence of particular data in cloud storage.
+> existing solutions delay the deduplication process to hide information regarding the presence of data.
+> this increases the communication overhead, even when the data is already present in storage.
+
+This paper aims to solve this problem by using a Bloom filter.
+> when a client sends an upload request, the server responds with a genuine Bloom filter (if the data exists) along with some dummy filters.
+
+### Bloom Filter based privacy preserving deduplication approach
+- In this approach, the Bloom filter (BF) is used as the identity of the file.
+The BF is stored at the server as the identity of the file (a construction sketch follows below):
+> 1. a client who possesses the file can generate the BF from the file blocks and request access to the file using the BF.
+> 2. the client sends the tag of the file; the server then responds with the set of BFs and encryption keys corresponding to the tag.
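+
+A minimal sketch of building such a file-identity Bloom filter from the file blocks (filter size, hash count, and the index encoding are illustrative assumptions, not the paper's parameters):
+
+```python
+# Sketch: a Bloom filter over a file's blocks can act as the file identity,
+# since only a client holding the actual blocks can regenerate the same filter.
+import hashlib
+
+M_BITS = 2 ** 16          # filter size in bits (illustrative)
+K_HASHES = 4              # number of hash functions (illustrative)
+
+def bf_positions(block: bytes, k: int = K_HASHES, m: int = M_BITS):
+    # Derive k bit positions from SHA-256(block || counter).
+    return [int.from_bytes(hashlib.sha256(block + bytes([i])).digest(), "big") % m
+            for i in range(k)]
+
+def build_bf(blocks):
+    bits = bytearray(M_BITS // 8)
+    for block in blocks:
+        for pos in bf_positions(block):
+            bits[pos // 8] |= 1 << (pos % 8)
+    return bytes(bits)
+
+bf = build_bf([b"block-0 contents", b"block-1 contents"])
+```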
+
+- Threat Model
+1. Semi-trusted Server
+Honestly performs the storage operations requested by clients, but is interested in learning the plaintext of the outsourced files (dictionary attacks).
+2. Malicious Client
+Misuses the deduplication process to learn the presence of a file in storage.
+
+
+- Security Goal
+1. Privacy
+An adversary should not be able to learn any information regarding the file.
+> the plaintext information, the presence of file.
+> Server responds with a set of BFs instead of "Yes/No" as a response.
+
+An adversary cannot learn the existence information without knowledge of the complete file.
+
+2. Integrity
+link hash value with stored/sent content.
+
+
+- Proposed Approach
+
+
+- Eavesdropping
+To protect the communication from client to server, it leverages a random nonce ($R$).
+> sends $BF \oplus R$ instead of $BF$
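+
+A small sketch of this XOR blinding (how $R$ is agreed on follows the paper's protocol; the filter bytes here are a placeholder):
+
+```python
+# Sketch: blind the Bloom filter on the wire with a one-time random nonce R.
+import os
+
+def xor_bytes(a: bytes, b: bytes) -> bytes:
+    assert len(a) == len(b)
+    return bytes(x ^ y for x, y in zip(a, b))
+
+bf = bytes(8192)                 # the Bloom filter bytes (placeholder)
+R = os.urandom(len(bf))          # fresh random nonce known to the receiver
+sent = xor_bytes(bf, R)          # BF xor R goes over the channel
+assert xor_bytes(sent, R) == bf  # xoring with R again restores BF
+```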
+
+
+### Implementation and Evaluation
+- Evaluation
+1. Upload time
+2. Communication cost
+
+
+## 2. Strength (Contributions of the paper)
+1. This paper uses a Bloom filter to achieve privacy-preserving deduplication and also provides a security analysis to justify the design.
+> it also includes an implementation to show the performance.
+
+
+## 3. Weakness (Limitations of the paper)
+1. This method does not fully solve the issue: if a malicious user owns a file, it can still test whether other people own this file as well.
+> information leakage still exists.
+
+
+## 4. Future Works
+1. This paper declares that it only considers client-side, file-level, inter-user deduplication. (When we consider a problem, we should state the scenario conditions clearly.)
+>1. client side deduplication or server side deduplication
+>2. file level or chunk level
+>3. inter user deduplication or intra user deduplication
+
2. The key idea of this paper is to use the Bloom filter as the identity of the file. In this way, the server does not need to respond "Yes/No" directly.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/CAONT-RS-HotStorage'14.md b/StoragePaperNote/Deduplication/Secure-Dedup/CAONT-RS-HotStorage'14.md
old mode 100644
new mode 100755
index 8ebcdab..9e697c9
--- a/StoragePaperNote/Deduplication/Secure-Dedup/CAONT-RS-HotStorage'14.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/CAONT-RS-HotStorage'14.md
@@ -1,63 +1,63 @@
----
-typora-copy-images-to: paper_figure
----
-Convergent Dispersal: Toward Storage-Efficient Security in a Cloud-of Clouds
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| HotStorage'14 | dispersal, deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The keyless security of existing dispersal algorithms relies on embedded random information which breaks data deduplication of the dispersed data. This paper proposes convergent dispersal, which replaces the original random information with deterministic cryptographic hash which is derived from the original data.
-
-The Summary of existing dispersal algorithm
-
-
-
-### Convergent Dispersal
-This paper contains two convergent dispersal algorithms
-> CRSSS and CAONT-RS
-> both of them augment existing dispersal algorithms with the deduplication property.
-
-- Target Deployment Scenario
-Each compute server performs the cross-user deduplication.
-
-The security of existing dispersal algorithms depends on the embedded **random information**. Due to randomness, distinct secrets with identical content lead to different sets of shares (impede data deduplication)
-
-- Key idea
- replace the original random information in existing dispersal algorithms with **deterministic hashes** generated from the secret.
-- Convergent RSSS (CRSSS)
-
-Given a secret, it splits it into a number of words which has the **same size** as hashes (e.g., SHA-256 $\rightarrow$ 32 bytes word size).
-Each time, it processes $k - r$ words, and generates $r$ hashes from the $k - r$ words uising $r$ different hashes.
-$$
-h_i = H(D, i), for \space i = 0,1,...,r-1
-$$
-
-- Convergent AONT-RS (CAONT-RS)
- Key idea:
- replace the random key employed in the AONT step of AONT-RS with a cryptographic hash generated from the secret.
-
-$$
-c_i = d_i \oplus E(h_{key}, i), for \space i = 0, 1, ..., s-1
-$$
-
-### Implementation and Evaluation
-- Investigate the throughput of CRSSS and CAONT-RS
-> measure the total amount of processed secret data divided by the computational time of generating all shares.
-> SHA-256: for default hash
-> AES-256: for default encryption function
-
-- Evaluate this performance
-> In local machine
-
-Insight: CRSSS can achieve a more flexible tradeoff between security and performance than CAONT-RS.
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-This paper does not analyze how the secret size affects the deduplication ratios for different dispersal algorithms. For CRSSS and CAONT-RS, since there are some constrains on the its secret sizes.
+---
+typora-copy-images-to: paper_figure
+---
+Convergent Dispersal: Toward Storage-Efficient Security in a Cloud-of-Clouds
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| HotStorage'14 | dispersal, deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The keyless security of existing dispersal algorithms relies on embedded random information which breaks data deduplication of the dispersed data. This paper proposes convergent dispersal, which replaces the original random information with deterministic cryptographic hash which is derived from the original data.
+
+Summary of existing dispersal algorithms:
+
+
+
+### Convergent Dispersal
+This paper contains two convergent dispersal algorithms
+> CRSSS and CAONT-RS
+> both of them augment existing dispersal algorithms with the deduplication property.
+
+- Target Deployment Scenario
+Each compute server performs the cross-user deduplication.
+
+The security of existing dispersal algorithms depends on the embedded **random information**. Due to randomness, distinct secrets with identical content lead to different sets of shares (impede data deduplication)
+
+- Key idea
+ replace the original random information in existing dispersal algorithms with **deterministic hashes** generated from the secret.
+- Convergent RSSS (CRSSS)
+
+Given a secret, it splits the secret into a number of words that have the **same size** as a hash (e.g., SHA-256 $\rightarrow$ 32-byte words).
+Each time, it processes $k - r$ words and generates $r$ hashes from these $k - r$ words using $r$ different hash indices:
+$$
+h_i = H(D, i), \quad \text{for } i = 0, 1, \ldots, r-1
+$$
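+
+A minimal sketch of this deterministic hash generation, following $h_i = H(D, i)$ over the $k - r$ data words $D$ (SHA-256 and the encoding of the index $i$ are illustrative):
+
+```python
+# Sketch of CRSSS's deterministic "random" input: the r padding hashes are
+# derived from the k - r data words themselves, so identical secrets give
+# identical hashes and hence deduplicable shares.
+import hashlib
+
+def crsss_hashes(data_words, r):
+    """data_words: the k - r words (bytes, each the hash size); returns h_0..h_{r-1}."""
+    d = b"".join(data_words)
+    return [hashlib.sha256(d + i.to_bytes(4, "big")).digest() for i in range(r)]
+
+words = [b"A" * 32, b"B" * 32]          # k - r = 2 data words of 32 bytes
+hashes = crsss_hashes(words, r=2)
+```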
+
+- Convergent AONT-RS (CAONT-RS)
+ Key idea:
+ replace the random key employed in the AONT step of AONT-RS with a cryptographic hash generated from the secret.
+
+$$
+c_i = d_i \oplus E(h_{key}, i), \quad \text{for } i = 0, 1, \ldots, s-1
+$$
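+
+A sketch of this XOR structure, using a SHA-256-derived keystream as a stand-in for the block cipher $E$ used in the paper (word sizes and the index encoding are illustrative):
+
+```python
+# Sketch of the convergent AONT step c_i = d_i XOR E(h_key, i).
+import hashlib
+
+def caont_encode(words, h_key):
+    """words: the s-1 fixed-size data words d_i; h_key: hash derived from the secret."""
+    out = []
+    for i, d in enumerate(words):
+        pad = hashlib.sha256(h_key + i.to_bytes(4, "big")).digest()[:len(d)]
+        out.append(bytes(a ^ b for a, b in zip(d, pad)))
+    return out
+
+secret_words = [b"C" * 32, b"D" * 32]
+h_key = hashlib.sha256(b"".join(secret_words)).digest()   # convergent: derived from the secret
+cipher_words = caont_encode(secret_words, h_key)
+```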
+
+### Implementation and Evaluation
+- Investigate the throughput of CRSSS and CAONT-RS
+> measure the total amount of processed secret data divided by the computational time of generating all shares.
+> SHA-256: for default hash
+> AES-256: for default encryption function
+
+- Evaluate the performance
+> on a local machine
+
+Insight: CRSSS can achieve a more flexible tradeoff between security and performance than CAONT-RS.
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+This paper does not analyze how the secret size affects the deduplication ratios of different dispersal algorithms, since for CRSSS and CAONT-RS there are some constraints on their secret sizes.
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/CDStore-ATC'15.md b/StoragePaperNote/Deduplication/Secure-Dedup/CDStore-ATC'15.md
old mode 100644
new mode 100755
index 99430cf..5c1483d
--- a/StoragePaperNote/Deduplication/Secure-Dedup/CDStore-ATC'15.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/CDStore-ATC'15.md
@@ -1,66 +1,66 @@
----
-typora-copy-images-to: paper_figure
----
-CDStore: Toward Reliable, Secure, and Cost-Efficient Cloud Storage via Convergent Dispersal
-------------------------------------------
-@ATC'15 @Cloud Deduplication
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-- Existing secret sharing algorithms prohibit storage savings achieved by **deduplication**. Thus, this paper tries to design a new multi-cloud storage system, which can provide a unified cloud storage solution with reliability, security, and cost efficiency guarantees. (Core: enable secret sharing with deduplication)
-
-
-### CDStore
-- Reliability
-> 1. Fault tolerance in cloud server side (cloud storage providers, CDStore servers) $\rightarrow$ Erasure Coding
-> 2. Fault tolerance in cloud client side $\rightarrow$ offloading metadata management to the server side
-
-- Security
-> 1. Exploits multi-cloud diversity (as long as a tolerabel number of clouds are uncompromised.)
-> 2. Two-stage deduplication to avoid insider side-channel attacks launched by malicious users.
-
-- Cost Efficiency
-> 1. use deduplication to reduce both bandwidth and storage costs.
-
-#### Convergent Dispersal
-Goal: two secrets with identical content must generate identical shares. (make deduplication possible)
-> replacing the embedded random input with a deterministic cryptographic hash drived from the secret.
-
-- CAONT-RS (a new instantiation of convergent dispersal)
-> 1. Improve performance: replaces the Rivest's AONT with another AONT based on **optimal asymmetric encryption padding** (OAEP) (small size words $\rightarrow$ a large-size, constant-value block)
->
-> 2. Support deduplications: replaces the random key in AONT with a deterministic cryptographic hash derived from the secret. (preserves **content similarity**)
->
-
-- Two-Stage Deduplication
-Goal: reduce the storage overhead and upload overhead, as well as defend side-channel attacks.
-> Client side: before a user uploads data to a cloud, it first generates fingerprints of data, and then checks (queries) with the cloud by fingerprint for the existence of any deduplicate data that has been uploaded by any user.
-
-> Server side: After CDStore server receives the shares from clients, it generates a fingerprint from each share (**re-compute again**, instead of the use the one generated by client.). And check with its deduplication index again.
-> The reason: to prevent the side-channel attack to gain unauthorized access to the share
-
-### Implementation and Evaluation
-- Some implementation details
-> 1. To reduce network I/Os: batch the shares to be uploaded to each cloud in a 4MB buffer.
-> 2. Variable-size chunking: Rabin fingerprinting
-> 3. To provide the reliability: offloading the metadata management in server side. distribute metadata across all CDStore servers for reliability.
-> 4. Multi-threading: intensive encoding/decoding operations at secret level, utilize the network transfer bandwidth.
-
-- Evaluation
-> 1. Encoding speed in client side (compare with AONT-RS, CAONT-RS-Rivest)
-> 2. Deduplication efficiency (deduplication saving)
-> 3. Transfer speed: Upload speed, download speed. (Two cases: duplicate data, trace-driven )
-> 4. Cost Analysis: estimate the monetary costs using the pricing models of Amazon EC2 and S3.
-
-## 2. Strength (Contributions of the paper)
-- This paper proposes CAONT-RS, which can improve the performance of AONT-RS, and also can allow deduplication by using deterministric hashes.
-- This paper also implements the CDStore system with a new instantiation of convergent dispersal, which can achieve higher throughput than its prior approach.
-- It also conducts a series of experiments and cost analysis to prove its design.
-
-## 3. Weakness (Limitations of the paper)
-- In its two-stage deduplication, the server needs to re-compute the fingerprint to defend the attacker, I think the overhead of this scheme is a lttile bit high.
-- CDStore does not consider how to further reduce the overhead of deduplication metadata.
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+CDStore: Toward Reliable, Secure, and Cost-Efficient Cloud Storage via Convergent Dispersal
+------------------------------------------
+@ATC'15 @Cloud Deduplication
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+- Existing secret sharing algorithms prohibit storage savings achieved by **deduplication**. Thus, this paper tries to design a new multi-cloud storage system, which can provide a unified cloud storage solution with reliability, security, and cost efficiency guarantees. (Core: enable secret sharing with deduplication)
+
+
+### CDStore
+- Reliability
+> 1. Fault tolerance in cloud server side (cloud storage providers, CDStore servers) $\rightarrow$ Erasure Coding
+> 2. Fault tolerance in cloud client side $\rightarrow$ offloading metadata management to the server side
+
+- Security
+> 1. Exploits multi-cloud diversity (as long as a tolerable number of clouds are uncompromised.)
+> 2. Two-stage deduplication to avoid insider side-channel attacks launched by malicious users.
+
+- Cost Efficiency
+> 1. use deduplication to reduce both bandwidth and storage costs.
+
+#### Convergent Dispersal
+Goal: two secrets with identical content must generate identical shares. (make deduplication possible)
+> replacing the embedded random input with a deterministic cryptographic hash derived from the secret.
+
+- CAONT-RS (a new instantiation of convergent dispersal)
+> 1. Improve performance: replaces the Rivest's AONT with another AONT based on **optimal asymmetric encryption padding** (OAEP) (small size words $\rightarrow$ a large-size, constant-value block)
+>
+> 2. Support deduplications: replaces the random key in AONT with a deterministic cryptographic hash derived from the secret. (preserves **content similarity**)
+>
+
+- Two-Stage Deduplication
+Goal: reduce the storage overhead and upload overhead, as well as defend side-channel attacks.
+> Client side: before a user uploads data to a cloud, it first generates fingerprints of the data, and then queries the cloud by fingerprint for the existence of any duplicate data that has been uploaded by any user.
+
+> Server side: after the CDStore server receives the shares from clients, it generates a fingerprint from each share (**re-computed**, instead of using the one generated by the client) and checks its deduplication index again.
+> The reason: to prevent side-channel attacks from gaining unauthorized access to shares (see the sketch below).
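+
+A minimal sketch of the two-stage flow (function and variable names are illustrative; the real system works on shares and a persistent index):
+
+```python
+# Sketch: the client queries by fingerprint to skip redundant uploads, but the
+# server never trusts the client's fingerprint and re-computes it from the share.
+import hashlib
+
+server_index = {}                          # fingerprint -> stored share
+
+def client_should_upload(share: bytes) -> bool:
+    fp = hashlib.sha256(share).hexdigest()
+    return fp not in server_index          # stage 1: client-side dedup check
+
+def server_store(share: bytes) -> None:
+    fp = hashlib.sha256(share).hexdigest() # stage 2: server re-computes the fingerprint
+    if fp not in server_index:
+        server_index[fp] = share
+
+share = b"some share contents"
+if client_should_upload(share):
+    server_store(share)
+```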
+
+### Implementation and Evaluation
+- Some implementation details
+> 1. To reduce network I/Os: batch the shares to be uploaded to each cloud in a 4MB buffer.
+> 2. Variable-size chunking: Rabin fingerprinting
+> 3. To provide the reliability: offloading the metadata management in server side. distribute metadata across all CDStore servers for reliability.
+> 4. Multi-threading: intensive encoding/decoding operations at secret level, utilize the network transfer bandwidth.
+
+- Evaluation
+> 1. Encoding speed in client side (compare with AONT-RS, CAONT-RS-Rivest)
+> 2. Deduplication efficiency (deduplication saving)
+> 3. Transfer speed: Upload speed, download speed. (Two cases: duplicate data, trace-driven )
+> 4. Cost Analysis: estimate the monetary costs using the pricing models of Amazon EC2 and S3.
+
+## 2. Strength (Contributions of the paper)
+- This paper proposes CAONT-RS, which improves the performance of AONT-RS and also allows deduplication by using deterministic hashes.
+- This paper also implements the CDStore system with a new instantiation of convergent dispersal, which can achieve higher throughput than its prior approach.
+- It also conducts a series of experiments and cost analysis to prove its design.
+
+## 3. Weakness (Limitations of the paper)
+- In its two-stage deduplication, the server needs to re-compute the fingerprints to defend against attackers; I think the overhead of this scheme is a little bit high.
+- CDStore does not consider how to further reduce the overhead of deduplication metadata.
+
+## 4. Future Works
- As mentioned in this paper, I think one potential direction is to consider other techniques to further deduplicate the metadata.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/ClearBox-CCS'15.md b/StoragePaperNote/Deduplication/Secure-Dedup/ClearBox-CCS'15.md
old mode 100644
new mode 100755
index 3ec2f33..baad007
--- a/StoragePaperNote/Deduplication/Secure-Dedup/ClearBox-CCS'15.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/ClearBox-CCS'15.md
@@ -1,91 +1,91 @@
----
-typora-copy-images-to: ../paper_figure
----
-Transparent Data Deduplication in the Cloud
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| CCS'15 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-For current cloud deduplication system, user does not directly benefit from the storage reducation.
-> As there is no transparent relation between effective storage costs and the prices offered to the users.
-> the cloud storage service providers charge their customers based on the amount of data that they store - irrespective of the level of data deduplication exhibited by their data.
-
-
-This paper proposes ClearBox, which enables cloud users to verify the effective storage space that their data is occupying in the cloud, and consequently to check whether they qualify for benefit such as price reductions.
-> relay on gateway to orchestrate cross-user file-based deduplication
-> provide its users with full transparency on the storage savings exhibited by their users.
-
-
-- Cost reducation estimation
-There is considerable room for price reductions for those users whose data undergoes considerable deduplication
-
-
-
-### ClearBox
-- Why it needs the additional gateway
-If using a decentralized scheme, it requires interaction among users, and is unlikely to scale as the number of users storing the same data increases.
-> thus, it argues the gateway is a logically centralized entity. (orchestrate cross-user-file-based deduplication)
-
-- System model
-Time is divided into epochs $E_i$ of equal length. Clients receive from $G$ a list of their files and **deduplication pattern** at the end of every epoch.
-> deduplication pattern of a given file: the number of users that store the same deduplicated file in the cloud.
-
-1. Attest Protocol
-This operation is executed by the gateway only, it generates a proof of cardinality of a given file ID and an epoch $E$ that attests an upper bound for the number of clients registered to this file within this epoch.
-
-2. Verify Protocol
-customers can verify the proof generated by $Attest$ protocol to verify the upper bound on the total number of file users.
-
-- Security model
-1. Maliciou users
-2. Rational gateway: the gateway and the storage provider will only deviate from the protocol if such a strategy increases their profit in the system
-3. This paper argues that the achievable security in its case is therefore that of MLE schemes.
-> guarantees privacy under chosen distribution attacks.
-
-- System design
-1. A novel Merkle-tree based cryptographich accumulator which is maintained by the gateway.
-> 1. accumulate the IDs of the users registered to the same file within the same time epoch.
-> 2. encode the upper bound on the number of accumulated value, and enable any legitimate client associated to the accumulator to verify this bound.
-
-
-2. Oblivious server-aided key generation protocol
-against brute force search attacks
-ensures a curious gateway/storage provider cannot acquire the necessary keys to decrypt the file.
-> sends the bliended hash $h$ to the gateway
-
-3. Time-Dependent Randomness
-Ensure the selection of files to which the gateway attests the deduplication pattern is randomly chosen and not precomputed.
-> Base on bitcoin
-
-### Implementation and Evaluation
-- Implementation
-> Java, JPBC library, BLS signatures
-> Amazon S3 and Dropbox
-> Gateway: it spawns multiple threads on $G$'s machine, each thread corresponding to a unique worker handling requests/bills of a given client.
-> Store the metadata in local MySQL database.
-
-- Evaluation
-Setting: NetEM to shape all traffic exchanged on the networking interfaces.
-Compared with DupLESS:
-> 1. key generation overhead (latency)
-> 2. Proof verification overhead
-> 3. GET, PUT latency
-
-## 2. Strength (Contributions of the paper)
-1. This paper proposes a novel cloud storage scheme based on a cryptographich tree-based accumulator to attest in logarithmic time the deduplication patterns of every file stored in the cloud.
-2. For security analysis, this paper shows it can resist against malicious clients and a curious storage provider.
-
-
-## 3. Weakness (Limitations of the paper)
-1. this paper is not easy to fully understand.
-2. In this work, it cannot hide the konwledge of file sizes and user access pattern
-
-## 4. Future Works
-1. This paper mentions a concept, called cardinality-proving accumulator which leverages Merkle tree in order to efficiently provide proofs of membership and (non-public) proofs of maximum set cardinality.
-
-2. This paper mentions for the key generation process, the BLS signature is faster than RSA signature in DupLESS to compute by the key server. But BLS signature is more expensive to verify by clients
-> this paper argues the client effort is dominated by hashing the file.
+---
+typora-copy-images-to: ../paper_figure
+---
+Transparent Data Deduplication in the Cloud
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| CCS'15 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+In current cloud deduplication systems, users do not directly benefit from the storage reduction.
+> there is no transparent relation between effective storage costs and the prices offered to the users.
+> cloud storage service providers charge their customers based on the amount of data that they store, irrespective of the level of data deduplication exhibited by their data.
+
+
+This paper proposes ClearBox, which enables cloud users to verify the effective storage space that their data occupies in the cloud, and consequently to check whether they qualify for benefits such as price reductions.
+> relies on a gateway to orchestrate cross-user file-based deduplication
+> provides its users with full transparency on the storage savings exhibited by their data.
+
+
+- Cost reduction estimation
+There is considerable room for price reductions for those users whose data undergoes considerable deduplication.
+
+
+
+### ClearBox
+- Why it needs the additional gateway
+If using a decentralized scheme, it requires interaction among users, and is unlikely to scale as the number of users storing the same data increases.
+> thus, it argues the gateway is a logically centralized entity. (orchestrate cross-user-file-based deduplication)
+
+- System model
+Time is divided into epochs $E_i$ of equal length. Clients receive from $G$ a list of their files and **deduplication pattern** at the end of every epoch.
+> deduplication pattern of a given file: the number of users that store the same deduplicated file in the cloud.
+
+1. Attest Protocol
+This operation is executed by the gateway only; it generates a proof of cardinality for a given file ID and an epoch $E$ that attests an upper bound for the number of clients registered to this file within this epoch.
+
+2. Verify Protocol
+Customers can check the proof generated by the $Attest$ protocol to verify the upper bound on the total number of file users.
+
+- Security model
+1. Malicious users
+2. Rational gateway: the gateway and the storage provider will only deviate from the protocol if such a strategy increases their profit in the system
+3. This paper argues that the achievable security in its case is therefore that of MLE schemes.
+> guarantees privacy under chosen distribution attacks.
+
+- System design
+1. A novel Merkle-tree based cryptographic accumulator which is maintained by the gateway.
+> 1. accumulates the IDs of the users registered to the same file within the same time epoch.
+> 2. encodes the upper bound on the number of accumulated values, and enables any legitimate client associated with the accumulator to verify this bound.
+
+
+2. Oblivious server-aided key generation protocol
+against brute force search attacks
+ensures that a curious gateway/storage provider cannot acquire the necessary keys to decrypt the file.
+> sends the blinded hash $h$ to the gateway
+
+3. Time-Dependent Randomness
+Ensure the selection of files to which the gateway attests the deduplication pattern is randomly chosen and not precomputed.
+> Based on Bitcoin
+
+### Implementation and Evaluation
+- Implementation
+> Java, JPBC library, BLS signatures
+> Amazon S3 and Dropbox
+> Gateway: it spawns multiple threads on $G$'s machine, each thread corresponding to a unique worker handling requests/bills of a given client.
+> Store the metadata in local MySQL database.
+
+- Evaluation
+Setting: NetEM to shape all traffic exchanged on the networking interfaces.
+Compared with DupLESS:
+> 1. key generation overhead (latency)
+> 2. Proof verification overhead
+> 3. GET, PUT latency
+
+## 2. Strength (Contributions of the paper)
+1. This paper proposes a novel cloud storage scheme based on a cryptographic tree-based accumulator to attest in logarithmic time to the deduplication pattern of every file stored in the cloud.
+2. For security analysis, this paper shows it can resist against malicious clients and a curious storage provider.
+
+
+## 3. Weakness (Limitations of the paper)
+1. This paper is not easy to fully understand.
+2. In this work, it cannot hide the knowledge of file sizes and user access patterns.
+
+## 4. Future Works
+1. This paper mentions a concept, called cardinality-proving accumulator which leverages Merkle tree in order to efficiently provide proofs of membership and (non-public) proofs of maximum set cardinality.
+
+2. This paper mentions that, for the key generation process, the BLS signature is faster for the key server to compute than the RSA signature in DupLESS, but the BLS signature is more expensive for clients to verify.
+> this paper argues the client effort is dominated by hashing the file.
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/DistributedKeyGen-CCSW'14.md b/StoragePaperNote/Deduplication/Secure-Dedup/DistributedKeyGen-CCSW'14.md
old mode 100644
new mode 100755
index 2528bb5..32130e9
--- a/StoragePaperNote/Deduplication/Secure-Dedup/DistributedKeyGen-CCSW'14.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/DistributedKeyGen-CCSW'14.md
@@ -1,94 +1,94 @@
----
-typora-copy-images-to: ../paper_figure
----
-# Distributed Key Generation for Encrypted Deduplication: Achieving the Strongest Privacy
-
-| Venue | Category |
-| :------------------------: | :------------------: |
-| CCSW'14 | Encrypted Deduplication |
-[TOC]
-
-## 1. Summary
-
-### Motivation of this paper
-1. Provide a new security notion
-For server-aided MLE
-> show it is strictly stronger than all relevant notions. Lacking in original paper
-
-2. Introduce a distributed protocol
-eliminates the need of the key server
-> allows less managed system such as **P2P** systems to enjoy the high security level.
-
-- The limitations of DupLESS
-relaying on a dedicated key server
-> difficult to deploy in a less managed setting such as P2P systems
-> impairs its security, as compromising a single key server reduces its protection to that of CE.
-
-### Distributed Key Generation
-
-- Security notion
- - A new security notion of the encrypted deduplication: D-IND$CPA
- - ciphertext indistinguishability (IND)
- - the adversary is restricted to querying the encryption oracle with *only distinct messages*
- - MLE or D-PKE cannot be semantically secure, as they leak **message equality**.
-
-- Eliminating the key server
-
-1. A distributed protocol removes the need for a centralized key server.
-> for P2P paradigm, it attains the same security as DupLESS
-
-2. As long as the user obtains the cooperation of any qualified subset of *players*, it can perform the desired operation.
-
-3. Threshold signature
-use Shoup's RSA-based scheme
-> variant of RSA threshold signature scheme
-
-Distributed oblivious key generation (DOPG)
-> signature shares
-> proof of correctness
-> combining shares
-> blinding: blind signature shares
-
-
-### Implementation and Evaluation
-- Implementation
- - Shoup's threshold signature with a number of optimizations to improve its performance
- - in Java
-
-- Evaluation
- - Microbenchmarks
- - Computation time
- - the client latency for the entire key generation process, including both computation
- - signer's signing time and client's combining time
- - network latency
- - Impact on upload throughput
-
-## 2. Strength (Contributions of the paper)
-1. propose a new security notion
-
-DupLESS lacks a rigorous security notion to verify its security
-
-2. a distributed key generation scheme
-For P2P system
-Based on threshold signature
-
-## 3. Weakness (Limitations of the paper)
-1. the performance overhead of Shoup's RSA signature scheme is still high.
-
-
-## 4. Some Insights (Future work)
-1. The weakness of convergent encryption
-lacking a rigorous theoretical treatment.
-> Server-aided MLE provides the best possible security for encrypted deduplication.
-
-2. P2P system
-the data can be scattered in a P2P fashion among the users.
-> without a storage service provider.
-
-3. semantic security
-if any probabilistic, polynomial-time adversary that is given the ciphertext of a certain message, and the message's length, cannot determine any partial information on the message.
-
-4. attack type in secure definition
-> CPA: chosen plaintext attacks
-> CCA1: chosen ciphertext attack
+---
+typora-copy-images-to: ../paper_figure
+---
+# Distributed Key Generation for Encrypted Deduplication: Achieving the Strongest Privacy
+
+| Venue | Category |
+| :------------------------: | :------------------: |
+| CCSW'14 | Encrypted Deduplication |
+[TOC]
+
+## 1. Summary
+
+### Motivation of this paper
+1. Provide a new security notion
+For server-aided MLE
+> shows it is strictly stronger than all relevant notions (this was lacking in the original paper).
+
+2. Introduce a distributed protocol
+eliminates the need for the key server
+> allows less managed systems such as **P2P** systems to enjoy the same high security level.
+
+- The limitations of DupLESS
+relying on a dedicated key server
+> difficult to deploy in a less managed setting such as P2P systems
+> impairs its security, as compromising a single key server reduces its protection to that of CE.
+
+### Distributed Key Generation
+
+- Security notion
+ - A new security notion of the encrypted deduplication: D-IND$CPA
+ - ciphertext indistinguishability (IND)
+ - the adversary is restricted to querying the encryption oracle with *only distinct messages*
+ - MLE or D-PKE cannot be semantically secure, as they leak **message equality**.
+
+- Eliminating the key server
+
+1. A distributed protocol removes the need for a centralized key server.
+> for P2P paradigm, it attains the same security as DupLESS
+
+2. As long as the user obtains the cooperation of any qualified subset of *players*, it can perform the desired operation.
+
+3. Threshold signature
+use Shoup's RSA-based scheme
+> variant of RSA threshold signature scheme
+
+Distributed oblivious key generation (DOPG)
+> signature shares
+> proof of correctness
+> combining shares
+> blinding: blind signature shares
+
+
+### Implementation and Evaluation
+- Implementation
+ - Shoup's threshold signature with a number of optimizations to improve its performance
+ - in Java
+
+- Evaluation
+ - Microbenchmarks
+ - Computation time
+ - the client latency for the entire key generation process, including both computation
+ - signer's signing time and client's combining time
+ - network latency
+ - Impact on upload throughput
+
+## 2. Strength (Contributions of the paper)
+1. propose a new security notion
+
+DupLESS lacks a rigorous security notion to verify its security
+
+2. a distributed key generation scheme
+For P2P system
+Based on threshold signature
+
+## 3. Weakness (Limitations of the paper)
+1. the performance overhead of Shoup's RSA signature scheme is still high.
+
+
+## 4. Some Insights (Future work)
+1. The weakness of convergent encryption
+lacking a rigorous theoretical treatment.
+> Server-aided MLE provides the best possible security for encrypted deduplication.
+
+2. P2P system
+the data can be scattered in a P2P fashion among the users.
+> without a storage service provider.
+
+3. semantic security
+A scheme is semantically secure if any probabilistic polynomial-time adversary, given the ciphertext of a certain message and the message's length, cannot determine any partial information about the message.
+
+4. attack type in secure definition
+> CPA: chosen plaintext attacks
+> CCA1: chosen ciphertext attack
> CCA2: adaptive chosen ciphertext attack
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/DupLESS-Security'13.md b/StoragePaperNote/Deduplication/Secure-Dedup/DupLESS-Security'13.md
old mode 100644
new mode 100755
index 64e8ac6..e2f05e7
--- a/StoragePaperNote/Deduplication/Secure-Dedup/DupLESS-Security'13.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/DupLESS-Security'13.md
@@ -1,53 +1,53 @@
----
-typora-copy-images-to: paper_figure
----
-DupLESS: Server-Aided Encryption for Deduplicated Storage
-------------------------------------------
-| Venue | Category |
-| :----------------: | :------------------: |
-| USENIX Security'13 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-In cloud deduplication, Message-locked encryption (MLE) is inherently subject to brute-force attacks that can recover files falling into a know set. This paper proposes an key-server based architecture that can provides secure deduplicated storage resisting brute-force attacks.
-> The weakness of **convergent encryption** (CE): deterministic and keyless, security only is possible when the target message is too **large** to exhaust. (unpredictable)
-
-- Secure deduplication: Dedup with strong security against untrusted storage
-- Compromise resilience: under client compromise
-
-### DupLESS (Duplicateless Encryption for Simple Storage)
-- It contains a group of affiliated clients, a key server
-> Goal: make DupLESS work **transparently** with existing Storage Service system. (Site as a layer on top of existing simple storage service interfaces)
-
-- Oblivious PRF (OPRF) protocol (Server-aided encryption)
-$F$: A pseudorandom function (PRF)
-$K \leftarrow F(K_s, H(f))$ ($f$ is the file)
-> OPRF: server learns nothing, client learns on $K$
-
-- Against online brute-force attack:
-Goal: slow down online brute-force trials from attacker controlled clients
-> limit clients to send $q$ queries per epoch.
-
-- System Overview
-
-> Encrypt and decrypt files
-> Handle file names and paths
-> Run Transparently: low overhead, still can run when KS is down, no client-side state
-
-### Implementation and Evaluation
-- DupLESS client: python, support **Dropbox** and **Google Drive**. (command-line interface)
-- Evaluation
-> 1. Bandwidth overhead: DupLESS bandwidth overhead compared to plain Dropbox
-> 2. Retrieval latency: vary file size, compare with plain Dropbox and Covergent Encryption.
-> 3. Storage Overhead: DupLESS storage overhead compared to dedup over plaintexts
-
-## 2. Strength (Contributions of the paper)
-1. The system design is very considerable. When authors design DupLESS, they consider deduplication, compromise resilience and brute-force attack resilience.
-2. To achieve low performance overhead, this paper also considers the optimization in client-to-KS OPRF protocol.
-3. To guarantee the performance, DupLESS aims to work as a transparent layer over the storage service system.
-## 3. Weakness (Limitations of the paper)
-1. Compared with plain Dropbox or Google Drive, DupLESS incurs the higher overhead on storage overhead (encrypted user key from user) so as to gain the improvement in security.
-## 4. Future Works
-1. How about do the deduplication over the metadata (e.g., the *key* used to encrypt data and be encrypted by user's private key)? Although the size of this part of data may be very small, they also store in back-end storage service.
-2. Can it leveage the characteristic of the data workload to further optimize the deduplication process?
+---
+typora-copy-images-to: paper_figure
+---
+DupLESS: Server-Aided Encryption for Deduplicated Storage
+------------------------------------------
+| Venue | Category |
+| :----------------: | :------------------: |
+| USENIX Security'13 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+In cloud deduplication, message-locked encryption (MLE) is inherently subject to brute-force attacks that can recover files falling into a known set. This paper proposes a key-server-based architecture that provides secure deduplicated storage resisting brute-force attacks.
+> The weakness of **convergent encryption** (CE): it is deterministic and keyless, so security is only possible when the target message space is too **large** to exhaust (i.e., unpredictable).
+
+- Secure deduplication: Dedup with strong security against untrusted storage
+- Compromise resilience: under client compromise
+
+### DupLESS (Duplicateless Encryption for Simple Storage)
+- It involves a group of affiliated clients and a key server (KS)
+> Goal: make DupLESS work **transparently** with existing storage service systems. (It sits as a layer on top of existing simple storage service interfaces.)
+
+- Oblivious PRF (OPRF) protocol (server-aided encryption)
+$F$: a pseudorandom function (PRF)
+$K \leftarrow F(K_s, H(f))$ ($f$ is the file)
+> OPRF: the server learns nothing, the client learns only $K$ (a toy sketch follows below)
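+
+A toy sketch of a blinded, RSA-based OPRF flow (insecure demo parameters and illustrative names; a real key server uses a full-size RSA key, and this only illustrates the blinding algebra):
+
+```python
+# Toy sketch of server-aided key generation: the client blinds H(f), the key
+# server KS applies its secret exponent, and the client unblinds to get K.
+import hashlib
+
+p, q, e = 99991, 99989, 65537          # tiny primes, for illustration only
+N = p * q
+d = pow(e, -1, (p - 1) * (q - 1))      # KS's secret exponent (Python 3.8+)
+
+def H_int(data: bytes) -> int:
+    return int.from_bytes(hashlib.sha256(data).digest(), "big") % N
+
+f = b"file contents"
+x = H_int(f)
+r = 12345                              # blinding factor; use a fresh random value coprime to N
+blinded = (x * pow(r, e, N)) % N       # KS sees only this, not H(f)
+
+signed_blinded = pow(blinded, d, N)    # KS's deterministic answer
+
+s = (signed_blinded * pow(r, -1, N)) % N
+assert s == pow(x, d, N)               # same value as applying d to H(f) directly
+K = hashlib.sha256(s.to_bytes((N.bit_length() + 7) // 8, "big")).digest()
+```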
+
+- Against online brute-force attack:
+Goal: slow down online brute-force trials from attacker controlled clients
+> limit clients to send $q$ queries per epoch.
+
+- System Overview
+
+> Encrypt and decrypt files
+> Handle file names and paths
+> Run Transparently: low overhead, still can run when KS is down, no client-side state
+
+### Implementation and Evaluation
+- DupLESS client: written in Python; supports **Dropbox** and **Google Drive** (command-line interface).
+- Evaluation
+> 1. Bandwidth overhead: DupLESS bandwidth overhead compared to plain Dropbox
+> 2. Retrieval latency: vary the file size; compare with plain Dropbox and Convergent Encryption.
+> 3. Storage Overhead: DupLESS storage overhead compared to dedup over plaintexts
+
+## 2. Strength (Contributions of the paper)
+1. The system design is very thorough. When the authors designed DupLESS, they considered deduplication, compromise resilience, and brute-force attack resilience.
+2. To achieve low performance overhead, this paper also considers the optimization in client-to-KS OPRF protocol.
+3. To guarantee the performance, DupLESS aims to work as a transparent layer over the storage service system.
+## 3. Weakness (Limitations of the paper)
+1. Compared with plain Dropbox or Google Drive, DupLESS incurs higher storage overhead (the per-user encrypted keys) in exchange for the improvement in security.
+## 4. Future Works
+1. What about deduplicating the metadata (e.g., the *key* used to encrypt the data, which is itself encrypted by the user's private key)? Although this part of the data may be very small, it is also stored in the back-end storage service.
+2. Can it leverage the characteristics of the data workload to further optimize the deduplication process?
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/EnhancedThreshold-TDSC'16.md b/StoragePaperNote/Deduplication/Secure-Dedup/EnhancedThreshold-TDSC'16.md
old mode 100644
new mode 100755
index 659929d..14a7b45
--- a/StoragePaperNote/Deduplication/Secure-Dedup/EnhancedThreshold-TDSC'16.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/EnhancedThreshold-TDSC'16.md
@@ -1,90 +1,90 @@
----
-typora-copy-images-to: ../paper_figure
----
-Enhanced Secure Thresholded Data Deduplication Scheme for Cloud Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| TDSC'16 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Outsourced data may require different levels of protection, depending on how "popular" the datum is.
-> this differentiation based on popularity also can allevuates the user's need to mannually sort the data as
-> common: deduplicated
-> potentially sensitive (non-deduplicate)
-
-Duplicate occurs only when the file becomes popular (i.e., shared by many users)
-> strictly confidential files that must never be deduplicated.
-
-### Enhanced threshold-based deduplicated encryption
-- Target scenario
-the outsourced dataset contains **few instances of some data items** and **many instances of others**.
-> two backup types:
-> 1. uploaded by many users: benefit strongly from deduplication
-> 2. uploaded by one or very few users only: require confidentiality
-
-- Multi-layered cryptosystem
-1. the inner layer is applied using **convergent encryption**.
-2. the outer layer is applied using **a semantically secure cryptosystem**.
-
-
-- System model
-Add two trusted entities: identity provider and index repository service.
-> 1. identity provider: users are identified when a user first joins the system.
-> 2. index repository service: generate the file identity.
-> 
-
-- Security Model
-The method of this paper achieves two kinds of security notions:
-> 1. semantic security for unpopular data
-> 2. conventional convergent security for popular data
-> prevent the storage provider which is honest-but-curious.
-
-- Popularity definition
-1. a system-wide popularity limit
-Represents the smallest number of **distinct, legitimate** users that need to upload a given file $F$ for that file to be declared popular.
-
-2. A file shall be decleared popular
-When at least $t$ uploads for this file have taken place.
-> $t \geq p_{lim} + n_{A}$
-> This paper just considers a single threshold $t$.
-
-### Implementation and Evaluation
-- Setting
-Dataset
-> 1. pirate bay dataset: each torrent represents a file (file popularity)
-> 2. Ubuntu popularity contest
-
-- Compare with three related work
-Dupless, ClearBox, PAKE
-
-
-1. Storage space reducation
-This paper mentions the efficiency of its scheme depends on
-> 1. the value of $t$ (the tunable parameter of the scheme)
-> 2. the popularity distribution of files in the dataset
-
-2. Overhead
-> 1. Client computation
-> 2. Server computation
-> 3. Communication
-> 4. Dropbox interact
-
-## 2. Strength (Contributions of the paper)
-1. This paper proposes an enhanced threshold cryptosystem that leverage popularity and allows fine-grained trade-off between security and storage efficiency
-> trade-off between security and storage efficiency
-
-2. This paper also disscuss the overall security of the proposed scheme
-> how to improve it?
-
-## 3. Weakness (Limitations of the paper)
-1. In this paper's model, it needs to trust the identity provider (IdP) and Index Repository Service (IRS)
-
-
-## 4. Future Works
-1. This paper mentions the case that how to prevent the attacker from guessing whether two ciphertexts *produced by the same user* corrspond to the same plaintext with a non-negligible advantage.
-
-2. The first dataset used in this paper can indicate the file popularity.
+---
+typora-copy-images-to: ../paper_figure
+---
+Enhanced Secure Thresholded Data Deduplication Scheme for Cloud Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| TDSC'16 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Outsourced data may require different levels of protection, depending on how "popular" the datum is.
+> this differentiation based on popularity also alleviates the user's need to manually sort the data into
+> common: deduplicated
+> potentially sensitive: non-deduplicated
+
+Deduplication is applied only when the file becomes popular (i.e., shared by many users);
+> strictly confidential files must never be deduplicated.
+
+### Enhanced threshold-based deduplicated encryption
+- Target scenario
+the outsourced dataset contains **few instances of some data items** and **many instances of others**.
+> two backup types:
+> 1. uploaded by many users: benefit strongly from deduplication
+> 2. uploaded by one or very few users only: require confidentiality
+
+- Multi-layered cryptosystem
+1. the inner layer is applied using **convergent encryption**.
+2. the outer layer is applied using **a semantically secure cryptosystem**.
+
+
+- System model
+Add two trusted entities: identity provider and index repository service.
+> 1. identity provider: users are identified when a user first joins the system.
+> 2. index repository service: generate the file identity.
+> 
+
+- Security Model
+The method of this paper achieves two kinds of security notions:
+> 1. semantic security for unpopular data
+> 2. conventional convergent security for popular data
+> prevent the storage provider which is honest-but-curious.
+
+- Popularity definition
+1. a system-wide popularity limit
+Represents the smallest number of **distinct, legitimate** users that need to upload a given file $F$ for that file to be declared popular.
+
+2. A file shall be declared popular
+when at least $t$ uploads of this file have taken place.
+> $t \geq p_{lim} + n_{A}$
+> This paper considers only a single threshold $t$ (a small sketch follows below).
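+
+A small sketch of this popularity transition (values and names are illustrative):
+
+```python
+# Sketch: a file keeps the semantically secure outer layer until at least
+# t = p_lim + n_A distinct users have uploaded it (n_A bounds attacker uploads).
+p_lim, n_A = 3, 2
+t = p_lim + n_A
+
+uploaders = {}   # file_id -> set of distinct user ids
+
+def record_upload(file_id, user_id):
+    users = uploaders.setdefault(file_id, set())
+    users.add(user_id)
+    if len(users) >= t:
+        return "popular: deduplicated, convergent layer only"
+    return "unpopular: keep the semantically secure outer layer"
+
+for uid in ["u1", "u2", "u3", "u4", "u5"]:
+    print(uid, record_upload("file-42", uid))   # flips to popular at the 5th distinct user
+```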
+
+### Implementation and Evaluation
+- Setting
+Dataset
+> 1. pirate bay dataset: each torrent represents a file (file popularity)
+> 2. Ubuntu popularity contest
+
+- Compare with three related work
+Dupless, ClearBox, PAKE
+
+
+1. Storage space reduction
+This paper mentions the efficiency of its scheme depends on
+> 1. the value of $t$ (the tunable parameter of the scheme)
+> 2. the popularity distribution of files in the dataset
+
+2. Overhead
+> 1. Client computation
+> 2. Server computation
+> 3. Communication
+> 4. Dropbox interact
+
+## 2. Strength (Contributions of the paper)
+1. This paper proposes an enhanced threshold cryptosystem that leverages popularity and allows a fine-grained trade-off between security and storage efficiency
+> trade-off between security and storage efficiency
+
+2. This paper also discusses the overall security of the proposed scheme
+> how to improve it?
+
+## 3. Weakness (Limitations of the paper)
+1. In this paper's model, it needs to trust the identity provider (IdP) and Index Repository Service (IRS)
+
+
+## 4. Future Works
+1. This paper mentions the question of how to prevent an attacker from guessing whether two ciphertexts *produced by the same user* correspond to the same plaintext with a non-negligible advantage.
+
+2. The first dataset used in this paper can indicate the file popularity.
> Pirate Bay dataset
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/FrequencyAnalysis-DSN'17.md b/StoragePaperNote/Deduplication/Secure-Dedup/FrequencyAnalysis-DSN'17.md
old mode 100644
new mode 100755
index 6b0aeac..373bd07
--- a/StoragePaperNote/Deduplication/Secure-Dedup/FrequencyAnalysis-DSN'17.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/FrequencyAnalysis-DSN'17.md
@@ -1,47 +1,47 @@
----
-typora-copy-images-to: paper_figure
----
-Information Leakage in Encrypted Deduplication via Frequency Analysis
-------------------------------------------
-| Venue | Category |
-| :----: | :------------------: |
-| DSN'17 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Existing MLE implementations still cannot fully protect against content leakage, mainly because their encryption approaches are deterministic.
-Previous stduies have addressed the possibility of launching frequency analysis MLE-based storage and also proposed cryptographic mechanisms to mitigate the issue.
-> Those investigations are theoretically driven.
-
-### Locality Based Attack
-- Chunk Locality
-This is very prevalent in backup workloads that chunks are likely to re-occur together with their neighboring chunks across backups.
-> rationale: changes to backups often appear in few clustered regions of chunks.
-
-In security perspective: if a plaintext chunk $M$ corresponds to a ciphertext chunk $C$, then the neighboring plaintext chunks of $M$ are likely to correspond to the neighboring ciphertext chunks of $C$.
-
-- Threat Model
-An attacker may obtain the auxiliary information (as the plaintext chunks of a prior backup).
-> Ciphertext-only mode: can only access the ciphertext chunks
-> Known-plaintext mode: a more powerful adversary that knows a small fraction of the ciphertext-plaintext chunks pairs about the latest backup.
-
-
-- Attack Model
-Message, ciphertext model:
-> $M_1, M_2, ....$, $C_1, C_2, ...$ shows the logical order for message chunks and ciphertext chunks.
-
-**Basic Attack**:
-It takes in $C$ and $M$ as input, and returns the result set of all inferred ciphertext-plaintext chunk pairs.
-> just counting the frequency of each kind of chunks, and sort them to generate ranking, and find the message and ciphertext chunks with the same ranking.
-
-This kind of attack is sensitive to data updates that occur across different versions of backups over time.
-> it exists many ties, i.e., chunks have the same frequency. How to break ties during sorting also affects the frequency rank.
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+Information Leakage in Encrypted Deduplication via Frequency Analysis
+------------------------------------------
+| Venue | Category |
+| :----: | :------------------: |
+| DSN'17 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Existing MLE implementations still cannot fully protect against content leakage, mainly because their encryption approaches are deterministic.
+Previous studies have addressed the possibility of launching frequency analysis against MLE-based storage and have also proposed cryptographic mechanisms to mitigate the issue.
+> Those investigations are theoretically driven.
+
+### Locality Based Attack
+- Chunk Locality
+Chunk locality is very prevalent in backup workloads: chunks are likely to re-occur together with their neighboring chunks across backups.
+> rationale: changes to backups often appear in a few clustered regions of chunks.
+
+From a security perspective: if a plaintext chunk $M$ corresponds to a ciphertext chunk $C$, then the neighboring plaintext chunks of $M$ are likely to correspond to the neighboring ciphertext chunks of $C$.
+
+- Threat Model
+An attacker may obtain auxiliary information (e.g., the plaintext chunks of a prior backup).
+> Ciphertext-only mode: can only access the ciphertext chunks
+> Known-plaintext mode: a more powerful adversary that knows a small fraction of the ciphertext-plaintext chunk pairs of the latest backup.
+
+
+- Attack Model
+Message/ciphertext model:
+> $M_1, M_2, \dots$ and $C_1, C_2, \dots$ denote the logical order of the message chunks and ciphertext chunks.
+
+**Basic Attack**:
+It takes in $C$ and $M$ as input and returns the set of all inferred ciphertext-plaintext chunk pairs (a minimal sketch follows).
+> count the frequency of each distinct chunk, sort the chunks by frequency to generate a ranking, and match the message and ciphertext chunks with the same rank.
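+
+A minimal sketch of this counting-and-ranking step (an illustration only, not the paper's implementation; chunk lists are assumed to be given as plain Python sequences of fingerprints):
+
+```python
+from collections import Counter
+
+def basic_attack(ciphertext_chunks, plaintext_chunks):
+    """Frequency analysis: pair chunks that have the same frequency rank."""
+    # Count how often each (fingerprinted) chunk occurs in logical order.
+    freq_c = Counter(ciphertext_chunks)
+    freq_m = Counter(plaintext_chunks)
+    # Sort both sides by descending frequency.
+    ranked_c = [c for c, _ in freq_c.most_common()]
+    ranked_m = [m for m, _ in freq_m.most_common()]
+    # Pair the i-th most frequent ciphertext chunk with the i-th most
+    # frequent plaintext chunk; zip stops at the shorter ranking.
+    return list(zip(ranked_c, ranked_m))
+```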
+
+This kind of attack is sensitive to data updates that occur across different versions of backups over time.
+> there exist many ties, i.e., chunks with the same frequency; how ties are broken during sorting also affects the frequency rank.
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/IndependentServer-CCS'15.md b/StoragePaperNote/Deduplication/Secure-Dedup/IndependentServer-CCS'15.md
old mode 100644
new mode 100755
index c8e6cb4..7161aa1
--- a/StoragePaperNote/Deduplication/Secure-Dedup/IndependentServer-CCS'15.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/IndependentServer-CCS'15.md
@@ -1,80 +1,80 @@
----
-typora-copy-images-to: ../paper_figure
----
-Secure Deduplication of Encrypted Data without Additional Independent Servers
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| CCS'15 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Current solutions either use
-> 1. convergent encryption: susceptible to offline brute-force attacks
-> 2. require the aid of additional independent servers, which is very diffcult to meet in commerical contexts. (online brute-force)
-
-This paper argues that those schemes with additional independent servers setting are unrealistic in commercial setting and can be bottlenecks for both security and performance.
-
-This paper propses a secure cross-user deduplication scheme that supports client-side encryption **without requiring any additional independent servers**.
-> based on PAKE (password authenticated key exchange)
-
-### PAKE
-- Goal: allow a client uploading an existing file to securely obtain the encryption key that was used by the client who has previously uploaded that file.
-> via using a PAKE-based protocol to compute identical keys for different copies of the same file.
-
-- Password Authenticated Key Exchange
-
-
-Its deduplication scheme uses the SPAKE2 protocol, which is secure in the concurrent setting and random oracle model.
-> one client runs multiple PAKE protocol instances with other clients.
-> 
-
-In this work, clients use the hash values of their files as their respective "passwords".
-
-- Design goal
-1. Security goal: brute-force attack
-2. Functional goal: maximize deduplication effectiveness, minimize computational and communication overhead
-
-
-- Deduplication protocol
-
-
-1. Client
-Before uploading a file $F$, the uploader $C$ calculates both the cryptographic hash and the short hash, and send the short hash to the storage server.
-
-2. Sever
-finds some checker clients with the same short hash. Then let the client run the PAKE protocol with those checkers. (input is the cryptographic hash)
-
-
-
-### Implementation and Evaluation
-- Evaluation:
-1. Dataset
-use a dataset comprising of Android application prevalence data to represent an environment with media files.
-2. Dedup. percentage vs rate limits
-not perfect deduplication
-
-## 2. Strength (Contributions of the paper)
-1. This paper presents the first single-server scheme for cross-user deduplication
-> enables the client-side semantically secure encryption, proving its security in the malicious model.
-
-2. This method can provide the better security guarantees without introduing an identity server.
-
-
-## 3. Weakness (Limitations of the paper)
-1. This paper assumes there are enough online clients who have uploaded files with the required short hash. But this assumption is not very solid.
-
-2. This paper uses the additively homomorphic encryption in its protocol, and the overhead of PAKE is not very low, which makes this method
-
-## 4. Future Works
-1. This paper argues that per-client rate limiting is not fully effective against such online brute-force attack since an adversary who may compromise multiple clients.
-> this paper leverages the per-file rate limiting strategy to prevent online brute-force attack.
-
-2. This paper argues that a compromised storage server can easily mount an offline brute-force attack on the hash if a file is predictable.
-> in this paper, it lets a client send a short hash, and due to the high collision rate, the storage server cannot use this short hash to reliably guess the content of files offline.
-> 13 bits long hash
-
-3. This paper mentions the issue of user involvement. And it argues that it is burdensome for average users to let them aware which files are sensitive. And it also foregoes deduplication of sensitive files together
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Secure Deduplication of Encrypted Data without Additional Independent Servers
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| CCS'15 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Current solutions either use
+> 1. convergent encryption: susceptible to offline brute-force attacks
+> 2. require the aid of additional independent servers, which is very difficult to meet in commercial contexts (online brute-force).
+
+This paper argues that schemes relying on additional independent servers are unrealistic in commercial settings and that such servers can become bottlenecks for both security and performance.
+
+This paper proposes a secure cross-user deduplication scheme that supports client-side encryption **without requiring any additional independent servers**.
+> based on PAKE (password authenticated key exchange)
+
+### PAKE
+- Goal: allow a client uploading an existing file to securely obtain the encryption key that was used by the client who previously uploaded that file.
+> via a PAKE-based protocol that computes identical keys for different copies of the same file.
+
+- Password Authenticated Key Exchange
+
+
+Its deduplication scheme uses the SPAKE2 protocol, which is secure in the concurrent setting and random oracle model.
+> one client runs multiple PAKE protocol instances with other clients.
+> 
+
+In this work, clients use the hash values of their files as their respective "passwords".
+
+- Design goal
+1. Security goal: resist brute-force attacks
+2. Functional goal: maximize deduplication effectiveness, minimize computational and communication overhead
+
+
+- Deduplication protocol
+
+
+1. Client
+Before uploading a file $F$, the uploader $C$ calculates both the cryptographic hash and the short hash, and sends the short hash to the storage server.
+
+2. Server
+The server finds checker clients with the same short hash and then lets the uploader run the PAKE protocol with those checkers (the input is the cryptographic hash); a client-side sketch follows.
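+
+A minimal client-side sketch of this preparation step (hashing only, under assumptions: SHA-256 stands in for the unspecified cryptographic hash, the 13-bit short-hash length is taken from the notes below, and the PAKE run itself is omitted):
+
+```python
+import hashlib
+
+def prepare_upload(file_bytes: bytes, short_bits: int = 13):
+    """Compute the full file hash (the PAKE 'password') and the lossy
+    short hash that is the only value sent to the storage server."""
+    full_hash = hashlib.sha256(file_bytes).digest()
+    # Keep only the first `short_bits` bits: many distinct files collide
+    # on this value, so it is useless for offline brute force.
+    short_hash = int.from_bytes(full_hash, "big") >> (8 * len(full_hash) - short_bits)
+    return full_hash, short_hash
+```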
+
+
+
+### Implementation and Evaluation
+- Evaluation:
+1. Dataset
+use a dataset comprising Android application prevalence data to represent an environment with media files.
+2. Dedup. percentage vs rate limits
+not perfect deduplication
+
+## 2. Strength (Contributions of the paper)
+1. This paper presents the first single-server scheme for cross-user deduplication
+> it enables client-side semantically secure encryption and proves its security in the malicious model.
+
+2. This method provides better security guarantees without introducing an identity server.
+
+
+## 3. Weakness (Limitations of the paper)
+1. This paper assumes there are enough online clients who have uploaded files with the required short hash. But this assumption is not very solid.
+
+2. This paper uses additively homomorphic encryption in its protocol, and the overhead of PAKE is not very low, which makes this method computationally expensive.
+
+## 4. Future Works
+1. This paper argues that per-client rate limiting is not fully effective against such online brute-force attacks, since an adversary may compromise multiple clients.
+> this paper instead leverages a per-file rate-limiting strategy to prevent online brute-force attacks.
+
+2. This paper argues that a compromised storage server can easily mount an offline brute-force attack on the hash if a file is predictable.
+> in this paper, it lets a client send a short hash, and due to the high collision rate, the storage server cannot use this short hash to reliably guess the content of files offline.
+> the short hash is only 13 bits long
+
+3. This paper mentions the issue of user involvement: it argues that it is burdensome to make average users aware of which files are sensitive, and that this approach also forgoes deduplication of sensitive files altogether.
+
4. This paper mentions that the very small sacrifice in deduplication ratio is offset by the significant advantage of ensuring user privacy without having to use independent third-party servers.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/InformationLeakage-CCSW'16.md b/StoragePaperNote/Deduplication/Secure-Dedup/InformationLeakage-CCSW'16.md
old mode 100644
new mode 100755
index 7246a6e..ffb503d
--- a/StoragePaperNote/Deduplication/Secure-Dedup/InformationLeakage-CCSW'16.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/InformationLeakage-CCSW'16.md
@@ -1,87 +1,87 @@
----
-typora-copy-images-to: paper_figure
----
-On Information Leakage in Deduplication Storage Systems
-------------------------------------------
-| Venue | Category |
-| :---------: | :------------------: |
-| ACM CCSW'16 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper argues that all existing proposals share the same goal that enabling cloud providers to deduplicate encrypted data stored by their users. However, they do not consider the information leakage incurred by **access traces**.
-> here, this paper mainly considers the service providers
-
-Thus, here it considers the information leaked to a curious storage provider by systems that perform **client-side deduplication and encryption**.
-> storage provider can still acquire considerable information about the stored files without knowledge of the encryption key.
-
-### Method Name
-- System Model
-Client-side deduplication: the encryption key can be generated using the help of a key server.
-
-
-- Threat Model
-An adversary can acquire the access traces of storage server.
-> by compromise, coercion
-> access traces: entries containing a timestamp, the object ID, the object size.
-
-The adversary wants to identify which objects forms a file.
-
-- Storage Graph
-model any file $f$ as a tree $T(f)$
-> each leaf node represents an object of $f$ as output by the chunking algorithm.
-> Two kinds of nodes: file nodes and object nodes
-
-
-
-This bipartite graph can be built by examining at the access traces of the storage server, without the need to learn the contents of the stored files.
-> anonymity set
-> candidate set: $C(f, G)$
-
-- Storage Inference
-Goal: analyze the probability that a target file $f$ is stored on $S$ given the storage graph $G$.
-> File absence: $C(f, G) = \varnothing$
-> File presence: $C(f, G)\neq\varnothing $
-
-How to quantify the probability that $f$ is actually stored on $S$?
-> take into account how likely it is for any file $f^{'} \neq f$ to have $T(f^{'}) = T(f))$
-> 
-
-Base on **Bayesian Network**.
-
-
-- Template Attacks
-This kind of attack leverages the probability of storage of different files in order to infer higher-level contextual unformation.
-> the adversary knows the fixed part in template and tries to infer the variable parts of the stored contracts.
-
-- Quantifying Anonymity Sets
-the anonymity set of a file $f$ is the set of all possible files that have the same deduplication fingerprint as $f$.
-> 1. File-based fingerprints
-> 2. Fixed-sized block-based fingerprints
-> 3. CDC-based fingerprints
-
-**CDC-based fingerprints**
-a file of size $n$ can have a large number of possible CDC-based deduplication fingerprints.
-
-> this paper provides the lower bound for the number of possible deduplication fingerprints for a given file size $n$.
-> Based on an modified version of the **stars-and-bars theorem**.
-> Show two random files can have deduplication fingerprints collision.
-
-### Implementation and Evaluation
-1. It evaulates anonymity sets in four datasets.
-> CDC-based deduplication techniques exhibit a significantly smaller anonymity sets and therefore higher leakage when compared to those of fixed-block and file-based algorithms.
-> If a candidate for such a file is found
-
-**Conclusion**
-Existing public chunking algorithms allow an adversary to construct deduplication fingerprints and compare the simulated deduplication fingerprints to those on the storage.
-
-## 2. Strength (Contributions of the paper)
-1. it analyzes and quantify the information leakage to a curious storage server due the data deduplication, and shows content-based chunking methods offer clear distinguisher for stored content.
-2. Do the experiment in two real world dataset, and shows that existing CDC-based deduplication schemes leak information by up to 54%.
-3. It shows that fixed-sized block-based deduplication technologies establish a solid trade-off between the achieved privacy and the storage-efficiency performance.
-## 3. Weakness (Limitations of the paper)
-1. The theorem part is not very easy to understand.
-## 4. Future Works
-1. This work leverages the **Bayesian Network** to compute the probability of a storage. And this kind of model and method can be used in the deduplication scenario.
+---
+typora-copy-images-to: paper_figure
+---
+On Information Leakage in Deduplication Storage Systems
+------------------------------------------
+| Venue | Category |
+| :---------: | :------------------: |
+| ACM CCSW'16 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper argues that all existing proposals share the same goal of enabling cloud providers to deduplicate encrypted data stored by their users. However, they do not consider the information leakage incurred by **access traces**.
+> here, this paper mainly considers the service providers
+
+Thus, here it considers the information leaked to a curious storage provider by systems that perform **client-side deduplication and encryption**.
+> storage provider can still acquire considerable information about the stored files without knowledge of the encryption key.
+
+### Method Name
+- System Model
+Client-side deduplication: the encryption key can be generated with the help of a key server.
+
+
+- Threat Model
+An adversary can acquire the access traces of the storage server.
+> by compromise, coercion
+> access traces: entries containing a timestamp, the object ID, the object size.
+
+The adversary wants to identify which objects form a file.
+
+- Storage Graph
+model any file $f$ as a tree $T(f)$
+> each leaf node represents an object of $f$ as output by the chunking algorithm.
+> Two kinds of nodes: file nodes and object nodes
+
+
+
+This bipartite graph can be built by examining the access traces of the storage server, without the need to learn the contents of the stored files (a grouping sketch follows).
+> anonymity set
+> candidate set: $C(f, G)$
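+
+A minimal sketch of how an adversary might group trace entries into candidate object sets (the tuple format and the time-gap heuristic are assumptions for illustration; the paper formalizes the result as a bipartite file/object storage graph):
+
+```python
+def group_objects_by_time(trace, gap=1.0):
+    """trace: list of (timestamp, object_id, size) entries sorted by time.
+    Objects accessed within `gap` seconds of each other are grouped as the
+    candidate leaf set of one file node, without reading any content."""
+    groups, current, last_ts = [], [], None
+    for ts, obj_id, size in trace:
+        if last_ts is not None and ts - last_ts > gap:
+            groups.append(current)
+            current = []
+        current.append((obj_id, size))
+        last_ts = ts
+    if current:
+        groups.append(current)
+    return groups
+```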
+
+- Storage Inference
+Goal: analyze the probability that a target file $f$ is stored on $S$ given the storage graph $G$.
+> File absence: $C(f, G) = \varnothing$
+> File presence: $C(f, G)\neq\varnothing $
+
+How to quantify the probability that $f$ is actually stored on $S$?
+> take into account how likely it is for any file $f^{'} \neq f$ to have $T(f^{'}) = T(f)$
+> 
+
+Based on a **Bayesian network**.
+
+
+- Template Attacks
+This kind of attack leverages the probability of storage of different files in order to infer higher-level contextual information.
+> the adversary knows the fixed part of the template and tries to infer the variable parts of the stored contracts.
+
+- Quantifying Anonymity Sets
+the anonymity set of a file $f$ is the set of all possible files that have the same deduplication fingerprint as $f$.
+> 1. File-based fingerprints
+> 2. Fixed-sized block-based fingerprints
+> 3. CDC-based fingerprints
+
+**CDC-based fingerprints**
+a file of size $n$ can have a large number of possible CDC-based deduplication fingerprints.
+
+> this paper provides a lower bound for the number of possible deduplication fingerprints for a given file size $n$.
+> Based on a modified version of the **stars-and-bars theorem** (the unconstrained identity is shown below).
+> Shows that two random files can have a deduplication fingerprint collision.
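+
+For intuition, the unconstrained form of this counting argument is the standard stars-and-bars identity shown below; the paper's bound modifies it to respect minimum and maximum chunk sizes, so this display is only the textbook starting point, not the paper's exact result.
+
+$$
+\#\{\text{ways to cut an } n\text{-byte file into } k \text{ chunks}\} = \binom{n-1}{k-1},
+\qquad
+\sum_{k=1}^{n} \binom{n-1}{k-1} = 2^{\,n-1}.
+$$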
+
+### Implementation and Evaluation
+1. It evaluates anonymity sets on four datasets.
+> CDC-based deduplication techniques exhibit significantly smaller anonymity sets and therefore higher leakage when compared to fixed-block and file-based algorithms.
+> If a candidate for such a file is found
+
+**Conclusion**
+Existing public chunking algorithms allow an adversary to construct deduplication fingerprints and compare the simulated deduplication fingerprints to those on the storage.
+
+## 2. Strength (Contributions of the paper)
+1. It analyzes and quantifies the information leakage to a curious storage server due to data deduplication, and shows that content-based chunking methods offer a clear distinguisher for stored content.
+2. It conducts experiments on two real-world datasets and shows that existing CDC-based deduplication schemes leak information by up to 54%.
+3. It shows that fixed-size block-based deduplication technologies establish a solid trade-off between the achieved privacy and storage efficiency.
+## 3. Weakness (Limitations of the paper)
+1. The theorem part is not very easy to understand.
+## 4. Future Works
+1. This work leverages a **Bayesian network** to compute the probability of storage, and this kind of model can be reused in other deduplication scenarios.
2. This work shows that CDC-based chunking algorithms can produce deduplication fingerprint collisions, which an adversary can exploit to obtain information.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/MinHash-DSN'17.md b/StoragePaperNote/Deduplication/Secure-Dedup/MinHash-DSN'17.md
old mode 100644
new mode 100755
index 39201a4..705b19e
--- a/StoragePaperNote/Deduplication/Secure-Dedup/MinHash-DSN'17.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/MinHash-DSN'17.md
@@ -1,124 +1,124 @@
----
-typora-copy-images-to: paper_figure
----
- Information Leakage in Encrypted Deduplication via Frequency Analysis
-------------------------------------------
-| Venue | Category |
-| :----: | :------------------: |
-| DSN'17 | Secure Deduplication |
-[TOC]
-
-## 1. Background and Motivation
-- Encrypted deduplication seamlessly combines encryption and deduplication to simultaneously achieve both **data security** and **storage efficiency**.
-- The deterministic encryption inherently reveals the underlying frequency distribution of the original plaintext chunks (an adversary can launch frequency analysis and infer the content of the original plaintext chunks) 产生information leakage!
-- Conventional (symmetric) encryption is **incompatible** with deduplication. (基本都会说的老套路)
-- Existing MLE implementations still cannot fully protect against content leakage, mainly because their encryption approaches are **deterministic** and an adversary can analyze the frequency distribution of ciphertext chunks and infer the original plaintext chunks based on classical frequency analysis.
- > In practical storage workloads often exhibit **non-uniform** frequency distributions in terms of the occurrences of chunks with the same content.
- > 
-
-- A sinplest form of the attack:
-
- > 1. an adversary first obtains **prior knowledge** of frequency distributions of plaintext chunks.
- > 2. count the frequencies of all the ciphertext chunks
- > 3. infer their corresponding plaintext chunks based on the frequency distribution of ciphertext chunks.
- >
-
-- The *practical* implications of frequency analysis against encrypted deduplication remain **unexplored**.
-
-- **Chunk locality**: If a plaintext chunk $M$ corresponds to a ciphertext chunk $C$, then the neighboring plaintext chunks of $M$ are likely to correspond to the neighboring ciphertext chunks of $C$.
-
-- **Deduplication**: it is a coarse-grained compression technique to save storage space.
- > 1. After chunking, each chunk is identified by a *fingerprint*, which is computed from the cryptographic hash of the content of the chunk.
- > 2. Two non-identical chunks have the same fingerprint is practically **negligible**.
- > 3. To check if any identical chunk exists, the storage system maintians a *fingerprint index* (a **key-value** store that holds the mappings of all fingerprints to the address of physical chunks that are currently stored.)
- > 4. For each file, the storage system also stores a **file recipe** that lists the references to all chunks of the file for future reconstruction.
-
-- MLE can only achieve security for **unpredictable chunks**, meaning that the size of the set of chunks is **sufficiently large**, such that the brute-force attack becomes infeasible.
-
-##2. Threat Model
-- It focus on **backup workloads**, which have substantial content redundancy and are proven to be effective for deduplication in practice.
-- It assumes the adversary is **honest-but-curious**
- > 1. means it does not change the prescribed protocols for the storage system and modify any data in storage.
- > 2. To launch frequency analysis, the adversary should have access to *auxiliary information* that provides ground truths about the backups being stored.
- > 3. **auxiliary information**: the plaintext chunks of a prior (non-latest) backup, which may be obtained through **unintended data releases** or **data breaches**.
- > 4. The **primary goal** of the adversary is to *infer* the content of the plaintext content of the plaintext chunks that are mapped to the ciphertext chunks of the latest backup.
- >
-
-- Two attack modes:
- > 1. *Ciphertext-only mode*: the adversary can access the ciphertext of the latest backup.
- > 2. *Known-plaintext mode*: a **powerful** adversary can not only access the ciphertext chunks of the latest backup but also knows a small fraction of the ciphertext-plaintext chunk pairs about the latest backup.
- > Those two attack modes can also access the **logical order** of ciphertext chunks of the latest backup before deduplication.
- >
-
-## 3. Attack
-- It presents **inference attack** based on frequency analysis against encrypted deduplication.
-- The adversarial goal of the attack:
- > 1. Let $C = $ be the sequence of **ciphertext** chunks in logical order for the latest backup.
- > 2. Let $M=$ be the sequence of **plaintext** chunks in logical order for a prior backup.
- > 3. Note that both $C$ and $M$ do not necessarily have the **same number** of chunks.
- > 4. **Goal**: Given $C$ and $M$, an adversary aims to infer the content of the original plaintext chunks in $C$.
-
-### 3.1 Basic Attack
-- In basic attack, it firstly identifies each chunk by its fingerprint and **counts the frequency** of each chnk by the number of fingerprints that appear in a buckup.
-
- > a chunk has a high frequency if there exist many identical chunks with the same content (因为假设adversary是在参考deduplication之前的数据)
-
-- It secondly **sorts** the chunks of both $C$ and $M$ by their frquencies
-- Thirdly, it **infers** the $i-th$ frequent plaintext chunk in $M$ is the original plaintext chunk of the $i-th$ frequent ciphertext chunk in $C$. (因为是deterministic encryption的原因)
-
-- Algorithm~1 (**basic attack**):
- > 1. *input*: $C$ and $M$
- > 2. *return*: set $\Gamma$ of ciphertext-plaintext chunk **pairs**
- > 3. *FREQ-ANALYSIS* performs frequency analysis based on $F_C$ and $F_M$. Since $F_C$ and $F_M$ may not have the same number of elements, it finds **the minimum number** of elements in $F_C$ and $F_M$. Finally, it returns the ciphertext-plaintext chunk pairs, in which both the ciphertext and plaintext chunks of each pair have the **same rank**.
-
-- **Discussion**: The inference accuracy of *Basic Attack* is small because
- > 1. Basic Attack is sensitive to **data updates**, an update to a chunk can change the frequency ranks of multiple chunks, including the chunk itself and other chnks with similar frequencies.
- > 2. There exist many **ties**, in which chunks have the same frequency. How to break a tie during sorting also affects the frequency rank and hence the inference the accuracy of the tied chunk.
- >
-
-### 3.2 Locality-based Attack
-
-- *Locality-based Attack* exploits **chunk locality** to improve the severity of frequency analysis.
-- In their observation: if a plaintext chunk $M$ of a prior backup has been identified as the original plaintext chunk of a ciphertext chunk $C$ of the latest backup, then **the left and right neighbors of $M$ are also likely to be original plaintext chunks of the left and right neighbors of $C$.**
-
- > Because chunk locality implies that the ordering of chunks is likely to be **preserved** across backups
-
-- Why this attack has more severity?
-
- > 1. For any inferred ciphertext-plaintext chunk pair $(C,M)$, we further infer **more** ciphertext-plaintext chunk pairs through the left and right neighboring chunks of $C$ and $M$, and repeat the same inference on those newly inferred chunk pairs (通过这样的方式增加**attack severity**)
-
-- The locality-baed attack proceeds as follows: in **each iteration**
- > 1. it picks one ciphertext-plaintext chunk pair $(C, M)$ from $\xi$
- > 2. it collects the corresponding sets of neighboring chunks $L_C, L_M, R_C$ and $R_M$.
- > 3. it applys frequency analysis to find the most frequent ciphertext-plaintext chunk pairs from each of $L_C$ and $L_M$, and similarly from $R_C$ and $R_M$.
- > There are three factors need to configure:
- > 4. $v$: indicate the number of most frequent chunks pairs returned from frequent analysis in an iteration.
- > 5. $u$: indicate the number of most frequent ciphertext-plaintext chunk pairs to be returned.
- > 6. $w$: bound the maximum size of $\xi$.
-
-- The locality-based attack can successfully infer the original plaintext chunks of all ciphertext chunks. It **cannot** infer the original plaintext chunk that does not appear in $M$.
-
-## 4. Attack Evaluation
-- Implement both the basic and locality-based attacks in **C++**.
-- Associative array: as key-value stores using **LevelDB**.
-- Dataset
- > **FSL** is a real-world dataset collected by the File systems and Storage Lab (FSL).
- > Synthetic: an initial snapshot from Ubuntu 14.04 virtual disk image.
-
-- Quantify the severity of an attack using the **inference rate %**.
-- Impact of parameters
-- Inference rate in ciphertext-only mode
-- Inference rate in known-plaintext mode
-
-## 5. Defense
-- To defend agsinst frequency analysis, it considers a **MinHash Encryption Scheme**
-
- > Encrypts each copy of identifal plaintext chunks into **possibly different ciphertext chunks**, so as to hide the frequency distribution of original chunks.
-
-- MinHash encryption builds on **Broder's theorem**:
- > if two sets share a large fraction of common elements (i.e., they are highly similar), then the probability that both sets share the same minimum hash element is also high.
- > relax the deterministic nature of encrypted deduplication by **encrypting some identical plaintext chunks to multiple distinct ciphertext chunks.**
-
-## 6. Defense Evaluation
-- Robustness Against Leakage
+---
+typora-copy-images-to: paper_figure
+---
+ Information Leakage in Encrypted Deduplication via Frequency Analysis
+------------------------------------------
+| Venue | Category |
+| :----: | :------------------: |
+| DSN'17 | Secure Deduplication |
+[TOC]
+
+## 1. Background and Motivation
+- Encrypted deduplication seamlessly combines encryption and deduplication to simultaneously achieve both **data security** and **storage efficiency**.
+- The deterministic encryption inherently reveals the underlying frequency distribution of the original plaintext chunks (an adversary can launch frequency analysis and infer the content of the original plaintext chunks), which results in information leakage.
+- Conventional (symmetric) encryption is **incompatible** with deduplication. (the standard observation made by almost every paper in this area)
+- Existing MLE implementations still cannot fully protect against content leakage, mainly because their encryption approaches are **deterministic** and an adversary can analyze the frequency distribution of ciphertext chunks and infer the original plaintext chunks based on classical frequency analysis.
+ > Practical storage workloads often exhibit **non-uniform** frequency distributions in terms of the occurrences of chunks with the same content.
+ > 
+
+- The simplest form of the attack:
+
+ > 1. an adversary first obtains **prior knowledge** of frequency distributions of plaintext chunks.
+ > 2. count the frequencies of all the ciphertext chunks
+ > 3. infer their corresponding plaintext chunks based on the frequency distribution of ciphertext chunks.
+ >
+
+- The *practical* implications of frequency analysis against encrypted deduplication remain **unexplored**.
+
+- **Chunk locality**: If a plaintext chunk $M$ corresponds to a ciphertext chunk $C$, then the neighboring plaintext chunks of $M$ are likely to correspond to the neighboring ciphertext chunks of $C$.
+
+- **Deduplication**: it is a coarse-grained compression technique to save storage space.
+ > 1. After chunking, each chunk is identified by a *fingerprint*, which is computed from the cryptographic hash of the content of the chunk.
+ > 2. The probability that two non-identical chunks have the same fingerprint is practically **negligible**.
+ > 3. To check whether any identical chunk exists, the storage system maintains a *fingerprint index* (a **key-value** store that holds the mappings of all fingerprints to the addresses of the physical chunks that are currently stored.)
+ > 4. For each file, the storage system also stores a **file recipe** that lists the references to all chunks of the file for future reconstruction.
+
+- MLE can only achieve security for **unpredictable chunks**, meaning that the size of the set of chunks is **sufficiently large**, such that the brute-force attack becomes infeasible.
+
+## 2. Threat Model
+- It focuses on **backup workloads**, which have substantial content redundancy and are proven to be effective for deduplication in practice.
+- It assumes the adversary is **honest-but-curious**
+ > 1. meaning it neither changes the prescribed protocols of the storage system nor modifies any data in storage.
+ > 2. To launch frequency analysis, the adversary should have access to *auxiliary information* that provides ground truths about the backups being stored.
+ > 3. **auxiliary information**: the plaintext chunks of a prior (non-latest) backup, which may be obtained through **unintended data releases** or **data breaches**.
+ > 4. The **primary goal** of the adversary is to *infer* the content of the plaintext chunks that are mapped to the ciphertext chunks of the latest backup.
+ >
+
+- Two attack modes:
+ > 1. *Ciphertext-only mode*: the adversary can access the ciphertext of the latest backup.
+ > 2. *Known-plaintext mode*: a **powerful** adversary can not only access the ciphertext chunks of the latest backup but also knows a small fraction of the ciphertext-plaintext chunk pairs about the latest backup.
+ > In both attack modes, the adversary can also access the **logical order** of ciphertext chunks of the latest backup before deduplication.
+ >
+
+## 3. Attack
+- It presents **inference attack** based on frequency analysis against encrypted deduplication.
+- The adversarial goal of the attack:
+ > 1. Let $C = \langle C_1, C_2, \dots \rangle$ be the sequence of **ciphertext** chunks in logical order for the latest backup.
+ > 2. Let $M = \langle M_1, M_2, \dots \rangle$ be the sequence of **plaintext** chunks in logical order for a prior backup.
+ > 3. Note that both $C$ and $M$ do not necessarily have the **same number** of chunks.
+ > 4. **Goal**: Given $C$ and $M$, an adversary aims to infer the content of the original plaintext chunks in $C$.
+
+### 3.1 Basic Attack
+- In the basic attack, it first identifies each chunk by its fingerprint and **counts the frequency** of each chunk by the number of times its fingerprint appears in a backup.
+
+ > a chunk has a high frequency if there exist many identical chunks with the same content (since the adversary is assumed to observe the data before deduplication)
+
+- It secondly **sorts** the chunks of both $C$ and $M$ by their frequencies
+- Thirdly, it **infers** that the $i$-th most frequent plaintext chunk in $M$ is the original plaintext chunk of the $i$-th most frequent ciphertext chunk in $C$ (this works because the encryption is deterministic).
+
+- Algorithm~1 (**basic attack**):
+ > 1. *input*: $C$ and $M$
+ > 2. *return*: set $\Gamma$ of ciphertext-plaintext chunk **pairs**
+ > 3. *FREQ-ANALYSIS* performs frequency analysis based on $F_C$ and $F_M$. Since $F_C$ and $F_M$ may not have the same number of elements, it finds **the minimum number** of elements in $F_C$ and $F_M$. Finally, it returns the ciphertext-plaintext chunk pairs, in which both the ciphertext and plaintext chunks of each pair have the **same rank**.
+
+- **Discussion**: The inference accuracy of *Basic Attack* is small because
+ > 1. Basic Attack is sensitive to **data updates**: an update to a chunk can change the frequency ranks of multiple chunks, including the chunk itself and other chunks with similar frequencies.
+ > 2. There exist many **ties**, in which chunks have the same frequency. How a tie is broken during sorting also affects the frequency rank and hence the inference accuracy of the tied chunks.
+ >
+
+### 3.2 Locality-based Attack
+
+- *Locality-based Attack* exploits **chunk locality** to improve the severity of frequency analysis.
+- In their observation: if a plaintext chunk $M$ of a prior backup has been identified as the original plaintext chunk of a ciphertext chunk $C$ of the latest backup, then **the left and right neighbors of $M$ are also likely to be original plaintext chunks of the left and right neighbors of $C$.**
+
+ > Because chunk locality implies that the ordering of chunks is likely to be **preserved** across backups
+
+- Why this attack has more severity?
+
+ > 1. For any inferred ciphertext-plaintext chunk pair $(C,M)$, it further infers **more** ciphertext-plaintext chunk pairs through the left and right neighboring chunks of $C$ and $M$, and repeats the same inference on those newly inferred chunk pairs (this is how the **attack severity** is increased).
+
+- The locality-based attack proceeds as follows (a simplified sketch appears at the end of this subsection): in **each iteration**
+ > 1. it picks one ciphertext-plaintext chunk pair $(C, M)$ from $\xi$
+ > 2. it collects the corresponding sets of neighboring chunks $L_C, L_M, R_C$ and $R_M$.
+ > 3. it applies frequency analysis to find the most frequent ciphertext-plaintext chunk pairs from each of $L_C$ and $L_M$, and similarly from $R_C$ and $R_M$.
+ > Three parameters need to be configured:
+ > 4. $v$: the number of most frequent chunk pairs returned from frequency analysis in an iteration.
+ > 5. $u$: the number of most frequent ciphertext-plaintext chunk pairs to be returned.
+ > 6. $w$: bounds the maximum size of $\xi$.
+
+- The locality-based attack can successfully infer the original plaintext chunks of the ciphertext chunks whose plaintexts appear in $M$; it **cannot** infer an original plaintext chunk that does not appear in $M$.
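+
+A simplified sketch of one neighbor-expansion step (illustration only: the parameters $u$ and $w$, tie handling, and the iteration over $\xi$ are omitted, and chunk sequences are plain Python lists of fingerprints):
+
+```python
+from collections import Counter
+
+def neighbors(seq, chunk, side):
+    """Collect the left (side=-1) or right (side=+1) neighbors of every
+    occurrence of `chunk` in the logical chunk sequence `seq`."""
+    return [seq[i + side] for i, x in enumerate(seq)
+            if x == chunk and 0 <= i + side < len(seq)]
+
+def expand_pair(C, M, pair, v=1):
+    """Given one inferred (ciphertext, plaintext) pair, infer up to 2*v new
+    pairs by running frequency analysis on the neighbor sets."""
+    c, m = pair
+    new_pairs = []
+    for side in (-1, +1):
+        fc = Counter(neighbors(C, c, side)).most_common(v)
+        fm = Counter(neighbors(M, m, side)).most_common(v)
+        new_pairs += [(cc, mm) for (cc, _), (mm, _) in zip(fc, fm)]
+    return new_pairs
+```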
+
+## 4. Attack Evaluation
+- Implement both the basic and locality-based attacks in **C++**.
+- Associative arrays: implemented as key-value stores using **LevelDB**.
+- Dataset
+ > **FSL** is a real-world dataset collected by the File systems and Storage Lab (FSL).
+ > Synthetic: an initial snapshot from an Ubuntu 14.04 virtual disk image.
+
+- It quantifies the severity of an attack using the **inference rate (%)**.
+- Impact of parameters
+- Inference rate in ciphertext-only mode
+- Inference rate in known-plaintext mode
+
+## 5. Defense
+- To defend against frequency analysis, it considers a **MinHash encryption scheme**
+
+ > Encrypts each copy of identical plaintext chunks into **possibly different ciphertext chunks**, so as to hide the frequency distribution of the original chunks.
+
+- MinHash encryption builds on **Broder's theorem**:
+ > if two sets share a large fraction of common elements (i.e., they are highly similar), then the probability that both sets share the same minimum hash element is also high.
+ > it relaxes the deterministic nature of encrypted deduplication by **encrypting some identical plaintext chunks to multiple distinct ciphertext chunks** (a key-derivation sketch follows).
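+
+A minimal sketch of the key-derivation idea (hedged: the paper's exact construction differs in details; here every chunk in a segment derives its key from the segment's minimum chunk hash, so identical chunks in dissimilar segments encrypt differently while highly similar segments still deduplicate):
+
+```python
+import hashlib
+
+def segment_key(segment_chunks):
+    """Derive one encryption key per segment from its minimum chunk hash."""
+    min_hash = min(hashlib.sha256(c).digest() for c in segment_chunks)
+    return hashlib.sha256(b"minhash-key|" + min_hash).digest()
+```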
+
+## 6. Defense Evaluation
+- Robustness Against Leakage
- Storage Efficiency
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/MitigatingSideChannel-IPDPS'18.md b/StoragePaperNote/Deduplication/Secure-Dedup/MitigatingSideChannel-IPDPS'18.md
old mode 100644
new mode 100755
index 67f6a32..feb7bad
--- a/StoragePaperNote/Deduplication/Secure-Dedup/MitigatingSideChannel-IPDPS'18.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/MitigatingSideChannel-IPDPS'18.md
@@ -1,109 +1,109 @@
----
-typora-copy-images-to: ../paper_figure
----
-Mitigating Traffic-based Side Channel Attacks in Bandwidth-efficient Cloud Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| IPDPS'18 | Deduplication Side Channel Attack |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The occurrence of deduplication can be easily identified by monitoring and analyzing network traffic, which leads to the risk of user privacy leakage.
-> Learning-the-remaining-information (LRI) attack.
-
-Existing work addresses the LRI attack ath the cost of the high bandwidth consumption.
-
-This paper proposes **randomized redundant chunk scheme** (RRCS) to mitigate the risk of the LRI attack while maintaing the high bandwidth efficiency of deduplication.
-> add randomized redundant chunks to mix up the real deduplication states of files used for the LRI attack.
-> obfuscates the view of the attacker.
-
-This paper argues current cloud storage system typically perform **cross-user source-based** deduplication
-> for higher storage and bandwidth efficiency.
-
-### Randomized redundant chunk scheme
-- This paper focuses on the side channel of traffic information. (like existing work.)
-> the attacker could only infer/probe/ the privacy by observing the amount of the network traffic between the client and server.
-
-- LRI attack
-The goal of the adversary is to get the sensitive information
-> can be represented as a small number of bits and easily covered in one-chunk size(about) in the chunk level.
-> the number of possible versions of the target file is moderate
-
-- LRI attack in chunk-level deduplication
-> Three deduplication states for a file:
-> 1. full deduplication: do not upload any chunk
-> 2. partial deduplication: upload some chunks
-> 3. no deduplication: full upload
-
-The key idea of this paper is to leverage chunk-level redundancy rather than the whole-file redundancy to mix up the deduplication states of the target file state
-> via uploading som redundant chunks in each file.
-
-- Deterministic version
-1. For the file without non-duplicate chunks, i.e., the whole file is duplicate, it randomly choose one chunk of the file to upload.
-> is easily broken, can append one non-duplicate chunk in each file to broken the solution
-
-2. For the file with non-duplicate chunks, it directly uploads its non-duplicate chunks
-
-However, using deterministic chunk-level redundancy for defending against LRI attack may be easily broken via Appending Chunks attack (ACA)
-> may fail to mitigate the risk of LRI.
-
-
-- Randomized Redundant Chunk Scheme
-The basic idea of adding redundant chunks is to choose the number of the redundant chunks from a range uniformly at random.
-> weaken the correlation between the duplicated and the existing files in the server from the network traffic point of view.
-> the number of redundant chunks is chosen uniformly at random, $N$ is the total number of chunks in a file. $[0, \lambda N]$, $\lambda$ is a tradoff between the security and bandwidth efficiency.
-> provide the security analysis for the range, and prove there still exists the risk of privacy leakage.
-
-- Overview workflow
-RRCS is implemented in the server.
-1. the client first divides the file into chunks using chunking algorithm
-2. the client upload the fingerprints of all chunks to the server
-3. after receiving the fingerprints, the server can know the deduplication state of the file via querying the fingerprint index.
-4. the server randomly chooses $R$ duplicate chunks and non-duplicate chunks, and responds to the client.
-
-
-### Implementation and Evaluation
-
-- Evaluation
-1. Datasets
-> FSLhome, MaxOS, Onefull (ATC'11): not public
-> This paper argues that since the files havin few copies account fo a significant proportion, RTS of Danny Harnik becomes bandwidth-inefficient in the real-world datasets
-
-2. Bandwidth overhead
-Total traffic with the increase of the number of file uploads, compare with
-> 1. Target-based deduplication
-> 2. Source-based deduplication
-> 3. File-level RTS
-> 4. Chunk-level RTS
-> 5. RRCS
-
-
-## 2. Strength (Contributions of the paper)
-1. this paper proposes **randomized redundant chunk scheme (RRCS)** to mitigate the risk of the LRI attack in cloud storage, while maintaining the high efficiency of deduplication.
-
-2. It gives an in-depth security analysis for RRCS
-> all possible variants of the deduplication detection method are not effective in RRCS.
-
-3. Implement a prototype in deduplication system, and examine the real performance of RRCS by using multiple large-scale real-world datasets.
-
-
-## 3. Weakness (Limitations of the paper)
-1. This paper has a very strong assumption that the senstive information should be included in just one chunk. This is a little bit weak.
-
-
-## 4. Future Works
-1. This paper argues the LRI attack in deduplication is difficult to be addressed due to the following challenges:
-> CE or MLE has limitations: the attack could always carry out the LRI attack based on the side channel of network traffic to preceive whether deduplication occurs without probing the data themselves transmitted in the network.
-
-
-2. Current side channel attack defence is ineffiecent:
-> disable cross-user deduplication via encryption. (personal encryption key)
-> just perform target-based deduplication
-
-3. This paper argues that Harnik's work
-> cause huge bandwidth overhead due to upload redundant data.
-> has the risk of leaking privacy with a certain probability.
-
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Mitigating Traffic-based Side Channel Attacks in Bandwidth-efficient Cloud Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| IPDPS'18 | Deduplication Side Channel Attack |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The occurrence of deduplication can be easily identified by monitoring and analyzing network traffic, which leads to the risk of user privacy leakage.
+> Learning-the-remaining-information (LRI) attack.
+
+Existing work addresses the LRI attack at the cost of high bandwidth consumption.
+
+This paper proposes the **randomized redundant chunk scheme** (RRCS) to mitigate the risk of the LRI attack while maintaining the high bandwidth efficiency of deduplication.
+> add randomized redundant chunks to mix up the real deduplication states of files used for the LRI attack.
+> obfuscates the view of the attacker.
+
+This paper argues that current cloud storage systems typically perform **cross-user source-based** deduplication
+> for higher storage and bandwidth efficiency.
+
+### Randomized redundant chunk scheme
+- This paper focuses on the side channel of traffic information. (like existing work.)
+> the attacker can only infer/probe private information by observing the amount of network traffic between the client and the server.
+
+- LRI attack
+The goal of the adversary is to obtain the sensitive information
+> the sensitive information can be represented as a small number of bits and is easily covered by roughly one chunk at the chunk level.
+> the number of possible versions of the target file is moderate
+
+- LRI attack in chunk-level deduplication
+> Three deduplication states for a file (classified in the sketch below):
+> 1. full deduplication: do not upload any chunk
+> 2. partial deduplication: upload some chunks
+> 3. no deduplication: full upload
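+
+A minimal classification sketch as seen from the traffic side channel (the uploaded-chunk count is assumed to be derivable from the observed traffic volume):
+
+```python
+def dedup_state(uploaded_chunks: int, total_chunks: int) -> str:
+    """Classify an upload as observed by a traffic-monitoring attacker."""
+    if uploaded_chunks == 0:
+        return "full deduplication"      # no chunk uploaded
+    if uploaded_chunks < total_chunks:
+        return "partial deduplication"   # some chunks uploaded
+    return "no deduplication"            # the whole file uploaded
+```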
+
+The key idea of this paper is to leverage chunk-level redundancy rather than whole-file redundancy to mix up the deduplication states of the target file
+> via uploading some redundant chunks for each file.
+
+- Deterministic version
+1. For a file without non-duplicate chunks, i.e., the whole file is duplicate, it randomly chooses one chunk of the file to upload.
+> this is easily broken: an attacker can append one non-duplicate chunk to each file to break the solution
+
+2. For a file with non-duplicate chunks, it directly uploads its non-duplicate chunks
+
+However, using deterministic chunk-level redundancy to defend against the LRI attack may be easily broken via the Appending Chunks Attack (ACA)
+> may fail to mitigate the risk of LRI.
+
+
+- Randomized Redundant Chunk Scheme
+The basic idea of adding redundant chunks is to choose the number of the redundant chunks from a range uniformly at random.
+> this weakens the correlation between the uploaded file and the existing files on the server from the network-traffic point of view.
+> the number of redundant chunks is chosen uniformly at random from $[0, \lambda N]$, where $N$ is the total number of chunks in a file and $\lambda$ is a trade-off between security and bandwidth efficiency (sketched below).
+> it provides a security analysis for the range and shows that a risk of privacy leakage still exists.
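+
+A minimal sketch of the randomized choice (the default value of $\lambda$ and the use of Python's `secrets` module are assumptions; selecting which duplicate chunks to re-upload is left out):
+
+```python
+import secrets
+
+def num_redundant_chunks(total_chunks: int, lam: float = 0.5) -> int:
+    """Pick the number of redundant chunks uniformly at random from
+    [0, lam * N]; lam trades security against bandwidth overhead."""
+    upper = int(lam * total_chunks)
+    return secrets.randbelow(upper + 1)  # uniform over {0, ..., upper}
+```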
+
+- Overview workflow
+RRCS is implemented in the server.
+1. the client first divides the file into chunks using chunking algorithm
+2. the client upload the fingerprints of all chunks to the server
+3. after receiving the fingerprints, the server can know the deduplication state of the file via querying the fingerprint index.
+4. the server randomly chooses $R$ duplicate chunks and non-duplicate chunks, and responds to the client.
+
+
+### Implementation and Evaluation
+
+- Evaluation
+1. Datasets
+> FSLhome, MaxOS, Onefull (ATC'11): not public
+> This paper argues that since files having only a few copies account for a significant proportion, Danny Harnik's RTS becomes bandwidth-inefficient on real-world datasets
+
+2. Bandwidth overhead
+Total traffic as the number of file uploads increases, compared with:
+> 1. Target-based deduplication
+> 2. Source-based deduplication
+> 3. File-level RTS
+> 4. Chunk-level RTS
+> 5. RRCS
+
+
+## 2. Strength (Contributions of the paper)
+1. this paper proposes **randomized redundant chunk scheme (RRCS)** to mitigate the risk of the LRI attack in cloud storage, while maintaining the high efficiency of deduplication.
+
+2. It gives an in-depth security analysis for RRCS
+> all possible variants of the deduplication detection method are not effective in RRCS.
+
+3. It implements a prototype in a deduplication system and examines the real performance of RRCS using multiple large-scale real-world datasets.
+
+
+## 3. Weakness (Limitations of the paper)
+1. This paper makes a very strong assumption that the sensitive information is contained in just one chunk, which is a little weak.
+
+
+## 4. Future Works
+1. This paper argues that the LRI attack in deduplication is difficult to address due to the following challenges:
+> CE or MLE has limitations: the attacker could always carry out the LRI attack based on the network-traffic side channel to perceive whether deduplication occurs, without probing the data transmitted over the network.
+
+
+2. Current side-channel attack defenses are inefficient:
+> disabling cross-user deduplication via encryption (a personal encryption key)
+> just performing target-based deduplication
+
+3. This paper argues that Harnik's work
+> causes huge bandwidth overhead due to uploading redundant data.
+> still has the risk of leaking privacy with a certain probability.
+
+
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/PoWSGX-ACNS'20.md b/StoragePaperNote/Deduplication/Secure-Dedup/PoWSGX-ACNS'20.md
new file mode 100755
index 0000000..e7f2cfe
--- /dev/null
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/PoWSGX-ACNS'20.md
@@ -0,0 +1,80 @@
+---
+typora-copy-images-to: ../paper_figure
+---
+Proofs of Ownership on Encrypted Cloud Data via Intel SGX
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ACNS'20 | PoW |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+ - Traditional PoWs rely on an assumption that the cloud server is **fully** trusted and has access to the original file content.
+ - In practice, the cloud server is not fully trusted
+ - the data owners may store their encrypted data in the cloud
+ - hindering execution of the traditional PoWs.
+
+- Main idea
+ - create a trusted execution environment in a cloud server (Intel SGX)
+ - the critical component of the PoW verification will be executed in this secure environment
+
+
+### PoWIS
+- System model
+ - file-based client-side deduplication
+ - Two entities: the cloud server and the data owner
+
+- Threat model
+ - the cloud server is honest-but-curious
+ - tries to learn sensitive information from the encrypted file
+ - the malicious data owner wants to pass the PoW check on a file without actually possessing this file
+- communication channel is protected by SSL/TLS.
+
+- Main design
+ - The PoW verification process is separated and delegated to the SGX enclave.
+ - The decryption key for decrypting the encrypted cloud data and the PoW proof will be transmitted via a secure channel.
+ - remain confidential to the untrusted cloud server.
+ - the stored encrypted cloud data (will be decrypted in the secure enclave via the decryption key sent by the client)
+ - The enclave uses the session key $K$ to perform decryption to obtain $K_{mle}$ and the PoW proof.
+
+
+- Remote attestation
+ - allow the client to attest the enclave and to negotiate a *session key* to protect communication between the client and the enclave.
+ - The session key is $K=g^{ab}$, where $g^a$ is the public key share from the enclave and $g^b$ is the public key share from the client (see the sketch below).
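+
+A toy sketch of the $K = g^{ab}$ agreement (illustration only: real SGX remote attestation uses ECDH over NIST P-256 plus quote verification, and the finite-field group below is not secure):
+
+```python
+import hashlib
+import secrets
+
+p = 2**127 - 1        # a Mersenne prime, fine only for illustration
+g = 3
+
+a = secrets.randbelow(p - 2) + 1      # enclave's secret exponent
+b = secrets.randbelow(p - 2) + 1      # client's secret exponent
+A, B = pow(g, a, p), pow(g, b, p)     # public shares g^a and g^b
+assert pow(B, a, p) == pow(A, b, p)   # both sides derive the same K = g^(ab)
+session_key = hashlib.sha256(pow(B, a, p).to_bytes(16, "big")).digest()
+```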
+
+- Security analysis
+ - It simply uses SGX as a *black box*, which is assumed to be secure.
+ - A malicious client cannot pass the PoW check without possessing the original file.
+ - The cloud server cannot learn anything about the original file.
+### Implementation and Evaluation
+
+- Implementation
+ - sgx-ssl, sgx-sdk
+ - C language
+
+- Performance
+ - RA breakdown
+ - PoW proof generation and verification time
+
+
+## 2. Strength (Contributions of the paper)
+1. PoWIS is the first secure PoW protocol designed for encrypted cloud data.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. An immediate remediation is to ask the potential data owner to first encrypt the original file and then compute the PoW proof over the encrypted file.
+> this can only ensure that the prover really owns the encrypted file, instead of the original file.
+
+2. Original PoW: a Merkle tree is first constructed over the file, and the resulting Merkle root is stored by the cloud server.
+> the cloud server will issue a challenge to the client.
+
+3. Remote Attestation
+Via the RA, the client can ensure that the enclave is running on the remote cloud server and executions inside the enclave are trustworthy.
+> a secure channel can be established between the client and the enclave at the same time. (allow the client to communicate with the enclave directly)
+
+
+4. Performance consideration
+By randomly checking a certain number of file blocks (e.g., 460), the cloud server can detect this misbehavior with a high probability (e.g., 99%); a back-of-the-envelope check follows.
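+
+A back-of-the-envelope check of those numbers, assuming the standard random-sampling argument (detection succeeds iff at least one sampled block falls in the fraction $f$ of blocks the prover does not actually hold):
+
+```python
+def detection_probability(f: float, checks: int) -> float:
+    """Probability that at least one of `checks` independently sampled
+    blocks hits the missing/modified fraction f."""
+    return 1.0 - (1.0 - f) ** checks
+
+print(detection_probability(0.01, 460))  # ~0.99 for f = 1% and 460 checks
+```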
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/PrivacyPreservingDedup-CLOUD'17.md b/StoragePaperNote/Deduplication/Secure-Dedup/PrivacyPreservingDedup-CLOUD'17.md
old mode 100644
new mode 100755
index f9ceb80..e13148f
--- a/StoragePaperNote/Deduplication/Secure-Dedup/PrivacyPreservingDedup-CLOUD'17.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/PrivacyPreservingDedup-CLOUD'17.md
@@ -1,105 +1,105 @@
----
-typora-copy-images-to: ../paper_figure
----
-Privacy-Preserving Data Deduplication on Trusted Processors
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| CLOUD'17 | Deduplication SGX |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Revealing ownership and equality information of the outsourced data to untrustworthy parties has serious privacy implications.
-
-- This paper leverages the SGX to design a privacy-preserving data deduplication protocol that protects not only the confidentiality of the outsourced data, but also their ownership and equality information.
-
-### SGX Deduplication
-- Basis of SGX
-1. SGX provisions the protected execution environments (a.k.a, enclaves). Each enclave is associated with a region on physical memory
-> 90MB
-> code and data inside the enclave is protected by the processor
-> enclave may access the enclave memory as well as memory outside the enclave region.
-
-2. Enclaves cannot directly execute OS-provided services such as I/O.
-> a communication channel between the enclave code and the untrusted environment (e.g., OS) is required to service OS-provided functions.
-
-3. Each such SGX processor has a unique secret burned into its fusion.
-> refer it as **seal key**.
-
-- Architecture
- - Proxy-based Architecture (three-tier setting)
- > cloud storage provider, the local proxy, the client
-
- - Duplicates uploaded by affiliated clients are detected and removed at **proxy**
- - Cross-enterprise deduplication is performed on **Storage sever S**
-client-proxy-storage server
-> offers almost similar bandwidth savings to that of client-side deduplication solutions, but does not admit the leakage wherein a client can learn if a file is already stored on the server.
-
-> **Storage server $S$** and **enterprise proxies $P$** are equipped with SGX-enabled processors.
-
-- Threat Model
-The attacker can gain complete control over the operating systems and other privileged softwares of storage servers and proxies.
-> However, the attacker cannot violate SGX guarantees.
-
-- Design Goal
- - Data confidentiality
- - Ownership information
- - Equality information
- - Performance requirements: keeping communication and computational overhead low.
-
-- Whole Design
-1. CUpload (client)
-> client interacts with **PEnclave** to use blind signature scheme to get the encryption key.
-> file-level deduplication, using **deterministic encryption**.
-> Rate-limiting
-
-2. PDedup (proxy)
-> perform a **privacy-preserving compaction** to remove duplicate, then uploads the deduplicated data to $S$.
-> prevent traffic analysis: pads the traffic from PEnclave to SEnclave following **differential privacy**.
->
-> > translate the Laplace noise to a corresponding number of chunks.
-
-### Implementation and Evaluation
-- Implementation
- - around 1000 LOC C code, 90MB enclave
- - Intel Skylake processors
-
-- Evaluation
- - Dataset: Debian Popularity Contest (treat each package as a file and its installation as an upload of that file)
- - Goal: to show the performance overhead is low.
-
-1. upload latency (vary file size)
-> compare with uploading the plain files of the same size.
-
-2. break down
-> encryption time, key derive time
-
-
-## 2. Strength (Contributions of the paper)
-
-1. It proposes three-tier deduplication architecture
-> save the bandwidth, yet does not admit the client-side deduplication leakage on file existence
-
-2. leverage SGX in proxy and storage server to protect **confidentiality**, **ownership** and **equality information** of the outsourced data against various adversaries.
-
-3. implement a prototype and conduct experiments to show it incur low overhead
-> in comparison with conventional deduplication solutions
-
-
-## 3. Weakness (Limitations of the paper)
-1. This method gives up the storage efficiency since it only consider two-stage deduplication and file-level deduplication, both of them are not exact deduplication. This paper can provide a deep analysis to show the loss of deduplication ratio.
-
-## 4. Some Useful Insights
-
-1. This paper argues that encryption only protects the confidentiality of the files at rest, while sensitive information can still be inferred from their metadata (e.g., ownership and equality information)
-> How to combine SGX with encrypted deduplication?
-
-2. This papers use **privacy-preserving compaction** to realize data deduplication instead of table lookup. This provides a new way to achieve data deduplication in SGX environment.
-
-3. This paper mentions the issue that
-> successful retrieval of an outsourced record reveals its ownership information.
-
-It can further prevent this leakage by using Oblivious RAM or Private Information retrieval.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Privacy-Preserving Data Deduplication on Trusted Processors
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| CLOUD'17 | Deduplication SGX |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Revealing ownership and equality information of the outsourced data to untrustworthy parties has serious privacy implications.
+
+- This paper leverages SGX to design a privacy-preserving data deduplication protocol that protects not only the confidentiality of the outsourced data, but also its ownership and equality information.
+
+### SGX Deduplication
+- Basis of SGX
+1. SGX provisions protected execution environments (a.k.a. enclaves). Each enclave is associated with a region of physical memory.
+> 90MB
+> code and data inside the enclave are protected by the processor
+> an enclave may access the enclave memory as well as memory outside the enclave region.
+
+2. Enclaves cannot directly execute OS-provided services such as I/O.
+> a communication channel between the enclave code and the untrusted environment (e.g., OS) is required to service OS-provided functions.
+
+3. Each such SGX processor has a unique secret burned into its fuses.
+> referred to as the **seal key**.
+
+- Architecture
+ - Proxy-based Architecture (three-tier setting)
+ > cloud storage provider, the local proxy, the client
+
+ - Duplicates uploaded by affiliated clients are detected and removed at **proxy**
+  - Cross-enterprise deduplication is performed at the **storage server $S$**
+client-proxy-storage server
+> offers bandwidth savings close to those of client-side deduplication solutions, but does not admit the leakage wherein a client can learn whether a file is already stored on the server.
+
+> **Storage server $S$** and **enterprise proxies $P$** are equipped with SGX-enabled processors.
+
+- Threat Model
+The attacker can gain complete control over the operating systems and other privileged software of the storage servers and proxies.
+> However, the attacker cannot violate SGX guarantees.
+
+- Design Goal
+ - Data confidentiality
+ - Ownership information
+ - Equality information
+ - Performance requirements: keeping communication and computational overhead low.
+
+- Whole Design
+1. CUpload (client)
+> the client interacts with **PEnclave** and uses a blind signature scheme to obtain the encryption key.
+> file-level deduplication, using **deterministic encryption**.
+> Rate-limiting
+
+2. PDedup (proxy)
+> performs a **privacy-preserving compaction** to remove duplicates, then uploads the deduplicated data to $S$.
+> prevents traffic analysis: pads the traffic from PEnclave to SEnclave following **differential privacy**.
+>
+> > translates the Laplace noise into a corresponding number of padding chunks (see the sketch below).
+
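+Below is a minimal sketch (not the paper's implementation) of how Laplace noise could be translated into a number of padding chunks before PEnclave forwards traffic to SEnclave; the privacy parameter `epsilon`, a per-chunk sensitivity of 1, and the rounding/clamping rule are illustrative assumptions.
+
+```python
+import numpy as np
+
+def padding_chunks(sensitivity_chunks: int = 1, epsilon: float = 0.1) -> int:
+    """Translate Laplace noise into a non-negative number of dummy chunks.
+
+    Illustrative only: sample Laplace noise with scale sensitivity/epsilon,
+    then round and clamp at zero so it can be realized as dummy chunk uploads
+    (the paper's exact noise post-processing may differ).
+    """
+    noise = np.random.laplace(loc=0.0, scale=sensitivity_chunks / epsilon)
+    return max(0, int(round(noise)))
+
+# Example: pad a batch of deduplicated chunks before forwarding to SEnclave.
+real_chunks = 42
+dummies = padding_chunks()
+print(f"forwarding {real_chunks + dummies} chunks ({dummies} dummies)")
+```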
+### Implementation and Evaluation
+- Implementation
+ - around 1000 LOC C code, 90MB enclave
+ - Intel Skylake processors
+
+- Evaluation
+ - Dataset: Debian Popularity Contest (treat each package as a file and its installation as an upload of that file)
+ - Goal: to show the performance overhead is low.
+
+1. upload latency (varying file size)
+> compared with uploading plain files of the same size.
+
+2. breakdown
+> encryption time, key derivation time
+
+
+## 2. Strength (Contributions of the paper)
+
+1. It proposes a three-tier deduplication architecture
+> saves bandwidth, yet does not admit the client-side deduplication leakage on file existence
+
+2. It leverages SGX at the proxy and storage server to protect the **confidentiality**, **ownership**, and **equality information** of the outsourced data against various adversaries.
+
+3. It implements a prototype and conducts experiments to show that it incurs low overhead
+> in comparison with conventional deduplication solutions
+
+
+## 3. Weakness (Limitations of the paper)
+1. This method sacrifices storage efficiency since it only considers two-stage, file-level deduplication, neither of which is exact deduplication. The paper could provide a deeper analysis of the resulting loss in deduplication ratio.
+
+## 4. Some Useful Insights
+
+1. This paper argues that encryption only protects the confidentiality of files at rest, while sensitive information can still be inferred from their metadata (e.g., ownership and equality information).
+> How to combine SGX with encrypted deduplication?
+
+2. This paper uses **privacy-preserving compaction** to realize data deduplication instead of table lookups. This provides a new way to achieve data deduplication in an SGX environment.
+
+3. This paper mentions the issue that
+> successful retrieval of an outsourced record reveals its ownership information.
+
+It can further prevent this leakage by using Oblivious RAM or Private Information Retrieval.
+
> how to mitigate its non-trivial performance overhead
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/RARE-INFOCOM'18.md b/StoragePaperNote/Deduplication/Secure-Dedup/RARE-INFOCOM'18.md
old mode 100644
new mode 100755
index f1f9018..b50e1d7
--- a/StoragePaperNote/Deduplication/Secure-Dedup/RARE-INFOCOM'18.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/RARE-INFOCOM'18.md
@@ -1,82 +1,82 @@
----
-typora-copy-images-to: ../paper_figure
----
-RARE: Defeating Side Channels based on Data-Deduplication in Cloud Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| INFOCOM'18 | Deduplication Side Channel |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-The commercial cloud storage services are in favor of the **cross-user client-side fixed-size-chunk-level** data deduplication to reach the highest deduplication gain.
-> the user uploads the file hash as the duplication check request
-> the cloud may check the status of file existence and then sends a binary duplication check reponse (dc response) to the user (suppress the explicit uploading of the file when the deduplication occurs)
-
-The threat from side-channel
-> the user needs a **deterministic response** from the cloud to know whether the further uploading of the file is necessary.
-
-- Key point
-Any deterministic response can be seen as an indicator of privacy leakage.
-
-
-### RARE
-- Key point
-the privacy leakage of side channel is due to the **deterministic relation** between duplication check request (dc request), duplication check response (dc response)
-> this work intends to reach the probabilistic relation by allowing the cloud to randomize the dc response.
-> keep the deduplication gain to the certain degree and eliminate the leakage of chunk existence status.
-
-- Main idea
-1. duplicate check of single chunk does not give sufficient room for dc response randomization, this paper performs the duplicate check on two chunks at once
-2. dirty chunks
-the chunks have been queried but not uploaded eventually can be exploited to perform repeated duplicate checks
-
-
-- Privacy notion
-1. existence privacy and inexistence privacy
-> dc response does not give any extra information about the existence status of a determined chunk.
-
-2. weaker version of existence privacy
-$P[C|f(c,aux)] = P[C] = 1/2$
-
-
-- Check double chunks
-In order to hide the chunk existence status, RARE carries out the encodings on both the dc response and the chunks to be uploaded.
-
-
-
-- Dirty chunk list
-RARE prevents the case that the attacker performs duplicate check but does not upload queried chunks
-> it implements a dirty chunk list to keep all hashes of chunks that have been queried but are not uploaded eventually.
-
-
-- Security analysis
-This work aims to achieve the inexistence privacy and weak existence privacy.
-
-### Implementation and Evaluation
-- Evaluation
-1. Dataset
-Enrom Email dataset
-
-2. communication cost
-the number of bits required during the entire chunk uploading process
-> duplicate check (dc response) and explicit chunk uploading (chunk)
-
-
-## 2. Strength (Contributions of the paper)
-1. Parameterless configuration
-RARE does not have the parameters that need to be determined manually.
-> relieve the burden for engineers
-
-2. No independent server
-RARE only involves the interactions between the user and cloud.
-
-
-## 3. Weakness (Limitations of the paper)
-1. the use of dirty chunks actually compromises the deduplication benefit. All of the dc requests relevant to dirty chunks will not trigger deduplication.
-
-
-## 4. Future Works
-1. the random response scheme is a typical method to defend the side channel attack in client side deduplication.
+---
+typora-copy-images-to: ../paper_figure
+---
+RARE: Defeating Side Channels based on Data-Deduplication in Cloud Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| INFOCOM'18 | Deduplication Side Channel |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+The commercial cloud storage services are in favor of the **cross-user client-side fixed-size-chunk-level** data deduplication to reach the highest deduplication gain.
+> the user uploads the file hash as the duplication check request
+> the cloud checks the file existence status and then sends a binary duplication check response (dc response) to the user (suppressing the explicit uploading of the file when deduplication occurs)
+
+The threat from the side channel:
+> the user needs a **deterministic response** from the cloud to know whether the further uploading of the file is necessary.
+
+- Key point
+Any deterministic response can be seen as an indicator of privacy leakage.
+
+
+### RARE
+- Key point
+the privacy leakage of the side channel is due to the **deterministic relation** between the duplication check request (dc request) and the duplication check response (dc response)
+> this work intends to reach a probabilistic relation by allowing the cloud to randomize the dc response.
+> keep the deduplication gain to a certain degree and eliminate the leakage of chunk existence status.
+
+- Main idea
+1. Since the duplicate check of a single chunk does not give sufficient room for dc response randomization, this paper performs the duplicate check on two chunks at once
+2. dirty chunks
+chunks that have been queried but never uploaded can be exploited to perform repeated duplicate checks
+
+
+- Privacy notion
+1. existence privacy and inexistence privacy
+> the dc response does not give any extra information about the existence status of a given chunk.
+
+2. weaker version of existence privacy
+$P[C|f(c,aux)] = P[C] = 1/2$
+
+
+- Check double chunks
+In order to hide the chunk existence status, RARE carries out the encodings on both the dc response and the chunks to be uploaded.
+
+
+
+- Dirty chunk list
+RARE prevents the case in which the attacker performs duplicate checks but does not upload the queried chunks
+> it maintains a dirty chunk list that keeps the hashes of all chunks that have been queried but never uploaded (see the sketch below).
+
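+A minimal sketch of the dirty-chunk bookkeeping described above (class and method names are illustrative, not RARE's implementation): hashes that were queried but never uploaded stay in the list, and later dc requests on them must not trigger deduplication.
+
+```python
+class DirtyChunkList:
+    """Track chunk hashes that were queried but not (yet) uploaded."""
+
+    def __init__(self) -> None:
+        self._dirty: set[str] = set()
+
+    def mark_queried(self, chunk_hash: str) -> None:
+        # A dc request marks the hash dirty until an upload arrives.
+        self._dirty.add(chunk_hash)
+
+    def mark_uploaded(self, chunk_hash: str) -> None:
+        # An actual chunk upload clears the dirty mark.
+        self._dirty.discard(chunk_hash)
+
+    def is_dirty(self, chunk_hash: str) -> bool:
+        # Dirty chunks are answered as if absent, so repeated probing
+        # of the same hash cannot reveal its existence status.
+        return chunk_hash in self._dirty
+```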
+
+- Security analysis
+This work aims to achieve the inexistence privacy and weak existence privacy.
+
+### Implementation and Evaluation
+- Evaluation
+1. Dataset
+Enron Email dataset
+
+2. communication cost
+the number of bits required during the entire chunk uploading process
+> duplicate check (dc response) and explicit chunk uploading (chunk)
+
+
+## 2. Strength (Contributions of the paper)
+1. Parameterless configuration
+RARE has no parameters that need to be determined manually.
+> relieves the burden on engineers
+
+2. No independent server
+RARE only involves interactions between the user and the cloud.
+
+
+## 3. Weakness (Limitations of the paper)
+1. The use of dirty chunks compromises the deduplication benefit: none of the dc requests involving dirty chunks will trigger deduplication.
+
+
+## 4. Future Works
+1. The random response scheme is a typical method to defend against side-channel attacks in client-side deduplication.
> how to design a tunable random response scheme that balances the loss of storage efficiency against the security gain?
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/SideChannel-S&P'10.md b/StoragePaperNote/Deduplication/Secure-Dedup/SideChannel-S&P'10.md
old mode 100644
new mode 100755
index 5fe04f2..18db603
--- a/StoragePaperNote/Deduplication/Secure-Dedup/SideChannel-S&P'10.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/SideChannel-S&P'10.md
@@ -1,84 +1,84 @@
----
-typora-copy-images-to: paper_figure
----
-Side Channels in Cloud Services, the Case of Deduplication in Cloud Storage
-------------------------------------------
-| Venue | Category |
-| :----: | :------------------: |
-| S&P'10 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper studies the privacy implications of cross-user deduplication. It shows that in different scenario, deduplication can be used as a covert channel by which malicious software can communicate with its command and control center.
-> regardless of any firewall settings at the attacked machine.
-
-### Side Channel Attacks and Defenses
-- Two features of the deduplication services for attacks
-1. Source-based deduplication:
-the client can observe whether a certain file or block was deduplicated.
-> by examining the amount of data transferred over the network.
-
-2. Cross-user deduplication:
-> each file or block is compared to the data of other users.
-
-- It is easy to identigy whether a storage service is source-based and cross-user deduplication
-1. installing two clients with different accounts
-> upload or download the popular files, and check whether it is re-transmitted.
-> e.g. Dropbox, Mozy,
-
-2. It is easy to test
-> monitor network traffic and measure the amount of transmitted data.
-
-3. Can query any information
-> Did any user previously upload a copy of this file?
-
-- Attack 1: Identifying files
-identifying whether a specific file, known to the attacker, was previously uploaded to the storage service.
-
-- Attack 2: learning the contents of files
-the attacker might apply Attack 1 to multiple version of the same file, performing a brute force attack over all possible values of the content of the file.
-> this attack can be applied whenever the number of **possible version of the target file is moderate**.
-
-- Attack 3: A Covert Channel
-a malicious user can leverage the deduplication attack to establish a covert channel from the malicious software to a remote control center run by that malicious user.
-> generate multiple versions of a file, and upload to the cloud storage, another client can use it to identify the transformed inforamtion.
-> the covert channel can be used to transfer arbitrarily long messages by having the software save more than single file.
-
-- Solution 1: Using encryption to stop deduplication
-1. Instead of using a global key, stop cross-user deduplication, ensures the encryption method is not deterministic.
-> using personal encryption key (costly to service provider, prevent usage of deduplication, key management issue)
-> susceptible to offline dictionary attacks against that key. (key is generated from a small domain)
-
-2. the services generate the encryption keys
-
-
-- Solution 2: Performing deduplication at the servers
-1. A major drawback: eliminates all bandwidth savings of deduplication, and service provide or user must pay for transferring the raw amount of data.
-2. A trade-off between bandwidth and privacy
-> small size files: always upload
-> larger files: source-based deduplication
-> assume: sensetive data are always in small size files
-
-
-- Solution 3: A Randomized Solution
-1. weak the corelation between deduplication and the existence of files in the storage service.
-> set a random threshold $T$: perform deduplication when the number of copies of the file exceeds this threshold.
-> Attacker: can test $T$ via uploading many copies of the file.
-
-2. A truly random solution
-> It is important to achieve that no one except for the server can compute $T$. And this can be achieve by using a private server.
-> it can also use a secret key $s$, and computing the threshold as a function of the contents of the file. $T = F(X, s)$
-> This solution can still adopt the sever-side deduplication
-
-
-## 2. Strength (Contributions of the paper)
-1. This paper provides three kinds of attacks for online deduplication system which are very practical.
-## 3. Weakness (Limitations of the paper)
-1. For the analysis part of whether the occurrence of deduplcation adds any information about whether a file was uploaded to the server or not, it is not very easy to understand, I think maybe something wrong in this part.
-2. No experiment to show the result in real case, just analysis.
-## 4. Future Works
-This paper shows that the usage of convergent encryption does not solve the security risks of data leakage.
-> Since an adversary can still identify the occurence of deduplication.
-
+---
+typora-copy-images-to: paper_figure
+---
+Side Channels in Cloud Services, the Case of Deduplication in Cloud Storage
+------------------------------------------
+| Venue | Category |
+| :----: | :------------------: |
+| S&P'10 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper studies the privacy implications of cross-user deduplication. It shows that, in a different scenario, deduplication can be used as a covert channel by which malicious software can communicate with its command and control center.
+> regardless of any firewall settings at the attacked machine.
+
+### Side Channel Attacks and Defenses
+- Two features of the deduplication services for attacks
+1. Source-based deduplication:
+the client can observe whether a certain file or block was deduplicated.
+> by examining the amount of data transferred over the network.
+
+2. Cross-user deduplication:
+> each file or block is compared to the data of other users.
+
+- It is easy to identify whether a storage service performs source-based and cross-user deduplication
+1. installing two clients with different accounts
+> upload popular files and check whether they are re-transmitted.
+> e.g., Dropbox, Mozy
+
+2. It is easy to test
+> monitor network traffic and measure the amount of transmitted data.
+
+3. The attacker can effectively query
+> Did any user previously upload a copy of this file?
+
+- Attack 1: Identifying files
+identifying whether a specific file, known to the attacker, was previously uploaded to the storage service.
+
+- Attack 2: learning the contents of files
+the attacker might apply Attack 1 to multiple versions of the same file, performing a brute-force attack over all possible values of the content of the file.
+> this attack can be applied whenever the number of **possible versions of the target file is moderate**.
+
+- Attack 3: A Covert Channel
+a malicious user can leverage the deduplication attack to establish a covert channel from the malicious software to a remote control center run by that malicious user.
+> generate multiple versions of a file and upload them to the cloud storage; another client can use this to identify the transferred information.
+> the covert channel can be used to transfer arbitrarily long messages by having the software save more than a single file.
+
+- Solution 1: Using encryption to stop deduplication
+1. Instead of using a global key, use a personal encryption key; this stops cross-user deduplication and ensures the encryption method is not deterministic.
+> using a personal encryption key (costly to the service provider, prevents the usage of deduplication, key management issues)
+> susceptible to offline dictionary attacks against that key (when the key is generated from a small domain)
+
+2. the services generate the encryption keys
+
+
+- Solution 2: Performing deduplication at the servers
+1. A major drawback: it eliminates all bandwidth savings of deduplication, and the service provider or user must pay for transferring the raw amount of data.
+2. A trade-off between bandwidth and privacy
+> small size files: always upload
+> larger files: source-based deduplication
+> assumption: sensitive data are always in small files
+
+
+- Solution 3: A Randomized Solution
+1. weaken the correlation between deduplication and the existence of files in the storage service.
+> set a random threshold $T$: perform deduplication when the number of copies of the file exceeds this threshold.
+> Attacker: can test $T$ via uploading many copies of the file.
+
+2. A truly random solution
+> It is important to ensure that no one except the server can compute $T$. This can be achieved by using a private server.
+> it can also use a secret key $s$ and compute the threshold as a function of the contents of the file: $T = F(X, s)$ (see the sketch below)
+> this solution can still adopt server-side deduplication
+
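+A minimal sketch of deriving a per-file threshold $T = F(X, s)$ with a keyed hash, as suggested above; HMAC-SHA256 and the threshold range `[2, max_t]` are illustrative assumptions, not the paper's concrete choices.
+
+```python
+import hashlib
+import hmac
+
+def dedup_threshold(file_bytes: bytes, server_secret: bytes, max_t: int = 20) -> int:
+    """T = F(X, s): a pseudorandom threshold in [2, max_t].
+
+    Only the server knows server_secret, so a client cannot predict T,
+    yet the same file always maps to the same threshold.
+    """
+    digest = hmac.new(server_secret, file_bytes, hashlib.sha256).digest()
+    return 2 + int.from_bytes(digest, "big") % (max_t - 1)
+
+def reveal_client_side_dedup(copies_seen: int, file_bytes: bytes, secret: bytes) -> bool:
+    """Suppress the client upload only once the copy count reaches T."""
+    return copies_seen >= dedup_threshold(file_bytes, secret)
+```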
+
+## 2. Strength (Contributions of the paper)
+1. This paper provides three kinds of very practical attacks on online deduplication systems.
+## 3. Weakness (Limitations of the paper)
+1. The analysis of whether the occurrence of deduplication adds any information about whether a file was uploaded to the server is not easy to follow; there may be an issue in this part.
+2. No experiments demonstrate the results in a real setting; the paper only provides analysis.
+## 4. Future Works
+This paper shows that the usage of convergent encryption does not solve the security risks of data leakage.
+> Since an adversary can still identify the occurrence of deduplication.
+
**Importance**: This paper also mentions that it is possible to choose the threshold according to different distributions to minimize the cost while maximizing security.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/SideChannelTradeOffs-AsiaCCS'17.md b/StoragePaperNote/Deduplication/Secure-Dedup/SideChannelTradeOffs-AsiaCCS'17.md
old mode 100644
new mode 100755
index 2c2637f..10af829
--- a/StoragePaperNote/Deduplication/Secure-Dedup/SideChannelTradeOffs-AsiaCCS'17.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/SideChannelTradeOffs-AsiaCCS'17.md
@@ -1,57 +1,57 @@
----
-typora-copy-images-to: ../paper_figure
----
-Side Channels in Deduplication: Trade-offs between Leakage and Efficiency
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ASIA CCS'17 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Cross-user client-side deduplication inherently gives the adversary access to a side-channel that may divulge whether or not a particular file is stored on the server, leading to leakage of user information.
-
-This paper proposes formal definitions for deduplication strategies and their security in terms of adversarial advantage.
-> provide a criterion for designing good strategies and then prove a bound characterizing the necessary **trade-off** between security and efficiency.
-
-### Method Name
-Client-side deduplication is generally preferable to server-side deduplication on economic grounds. For Danny Harnik's work, it can be seen as a compromise between the efficiency of client-side deduplication and the security of server-side deduplication.
-> To simplify its results, it focuses on file-based deduplication.
-
-- The trade-off between security and efficiency
-In Danny Harnik's work, it can control the threshold to achieve the trade-off between security and efficiency. Because the threshold indicates the the maximum number of times a file may need to be uploaded and is thus the worst-case overhead for bandwidth.
-
-- Modeling of deduplication
-Here, it may wish to implement a deduplication strategy that choose the upload threshold based on some probability distribution.
-> Ideally, reduce an adversary's ability to gain information from its uploads, in a way that does not severely impact the amount of bandwidth required.
-
-In this paper, it regards **deduplication strategies** as $\rightarrow$ **distributions on the possible thresholds**.
-> a strategy $DS$ can be viewed as the list ($p_0 = 0, p_1, ....$) where $p_i$ is the probability that the threshold is value $i$.
-> $DS$ is a probability mass function. $DS.Alg$ is the algorithm that implements strategy DS.
-
-
-- Security Notion of Existence-of-File Attack
-This paper introduces the indistinguishably under existence-of-file attack (IND-EFA)
-> game
-
-Given a deduplication strategy, since the adversary's job is essentially to distinguish two probability distributions, it defines the statistical distance of the two distributions, called this **security level** $\Delta$.
-$$
-\Delta =\sum^{\infty}_{i=0} |p_i - p_{i+1}|
-$$
-
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. this paper proposes formal definitions for side-channel deduplication strategies, including a natural measure for effectiveness of countermeasures.
-2. characterizing the trade-off between security and efficiency necessary for different strategies.
-
-## 3. Weakness (Limitations of the paper)
-1. This paper is too theoretical, and does not provide any material related to experiments.
-
-## 4. Future Works
-1. This paper shows that **uniform distribution for probabilistic uploads** provides the optimal solution for a natural measure, which presents a trade-off between security and bandwidth usages.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Side Channels in Deduplication: Trade-offs between Leakage and Efficiency
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ASIA CCS'17 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Cross-user client-side deduplication inherently gives the adversary access to a side-channel that may divulge whether or not a particular file is stored on the server, leading to leakage of user information.
+
+This paper proposes formal definitions for deduplication strategies and their security in terms of adversarial advantage.
+> provide a criterion for designing good strategies and then prove a bound characterizing the necessary **trade-off** between security and efficiency.
+
+### Deduplication Strategies and Security Notions
+Client-side deduplication is generally preferable to server-side deduplication on economic grounds. Danny Harnik's work (the randomized-threshold approach) can be seen as a compromise between the efficiency of client-side deduplication and the security of server-side deduplication.
+> To simplify its results, it focuses on file-based deduplication.
+
+- The trade-off between security and efficiency
+In Danny Harnik's work, the threshold can be controlled to trade off security and efficiency, because the threshold indicates the maximum number of times a file may need to be uploaded and is thus the worst-case bandwidth overhead.
+
+- Modeling of deduplication
+Here, one may wish to implement a deduplication strategy that chooses the upload threshold based on some probability distribution.
+> Ideally, reduce an adversary's ability to gain information from its uploads, in a way that does not severely impact the amount of bandwidth required.
+
+In this paper, **deduplication strategies** are regarded as $\rightarrow$ **distributions on the possible thresholds**.
+> a strategy $DS$ can be viewed as the list $(p_0 = 0, p_1, \dots)$ where $p_i$ is the probability that the threshold equals $i$.
+> $DS$ is a probability mass function; $DS.Alg$ is the algorithm that implements strategy $DS$.
+
+
+- Security Notion of Existence-of-File Attack
+This paper introduces indistinguishability under existence-of-file attack (IND-EFA)
+> defined via a security game
+
+Given a deduplication strategy, since the adversary's job is essentially to distinguish two probability distributions, the paper defines the statistical distance between the two distributions as the **security level** $\Delta$ (a worked example follows the formula):
+$$
+\Delta =\sum^{\infty}_{i=0} |p_i - p_{i+1}|
+$$
+
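+A small worked example of the formula above, assuming a uniform threshold distribution over $\{1, \dots, B\}$ (the strategy this paper identifies as optimal); the computation follows the formula exactly as written in this note, so the constant may differ from the paper's normalization.
+
+```python
+def security_level(p: list[float]) -> float:
+    """Delta = sum_i |p_i - p_{i+1}| for a threshold pmf p with p[0] = 0."""
+    q = p + [0.0]  # pad so the final term |p_B - 0| is included
+    return sum(abs(q[i] - q[i + 1]) for i in range(len(q) - 1))
+
+def uniform_strategy(B: int) -> list[float]:
+    """Threshold uniform on {1, ..., B}; B bounds the worst-case re-uploads."""
+    return [0.0] + [1.0 / B] * B
+
+for B in (1, 5, 20):
+    print(B, security_level(uniform_strategy(B)))  # 2.0, 0.4, 0.1
+# Larger B -> smaller Delta (better security), but larger worst-case bandwidth.
+```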
+
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. This paper proposes formal definitions for side-channel deduplication strategies, including a natural measure for the effectiveness of countermeasures.
+2. It characterizes the trade-off between security and efficiency necessary for different strategies.
+
+## 3. Weakness (Limitations of the paper)
+1. This paper is too theoretical and does not provide any experimental evaluation.
+
+## 4. Future Works
+1. This paper shows that a **uniform distribution for probabilistic uploads** provides the optimal solution for a natural measure, which presents a trade-off between security and bandwidth usage.
+
2. This paper also mentions the topic of security in **memory deduplication**. It says that if the KSM module were to use randomized thresholds for deduplication of memory pages, then the trade-off between efficiency and security would be very similar to the cloud storage scenario.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/TappingPotential-CNS'18.md b/StoragePaperNote/Deduplication/Secure-Dedup/TappingPotential-CNS'18.md
old mode 100644
new mode 100755
index 4863ea4..0aaeee6
--- a/StoragePaperNote/Deduplication/Secure-Dedup/TappingPotential-CNS'18.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/TappingPotential-CNS'18.md
@@ -1,112 +1,112 @@
----
-typora-copy-images-to: paper_figure
----
-Tapping the Potential: Secure Chunk-based Deduplication of Encrypted Data for Cloud Backup
-------------------------------------------
-| Venue | Category |
-| :---------: | :------------------: |
-| IEEE CNS'18 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Existing the secure deduplication designs are at odds with the real-world dedupe requirements in terms of security and performance. (pay little attention to the challenges and practical requirements of chunk-based deduplication (CBD))
-
-
-The practical requirements:
-1. Low-entropy chunk: brute-force attack for predictable files. CBD will amplify the attack efficacy due to the potentially much lower entropy contained in a small chunk.
-> Question: how to reduce the risk of the information leakage with minimal impact on the underlying deduplication routine?
-
-2. Increased system operation overhead: incurs higher latency and computation overhead, since the client needs to run the key generation protocol with other online parties (a key server or peer clients)
-> Question: can it speed up the key generation while still ensuring an effective deduplication function?
-
-3. Practical dedupe performance: in addition to the deduplication ratio, it also needs to concern chunk fragmentation level, or restore speed.
-A high level fragmentation level typically adversely affects the system read performance and further increase the data restore cost.
-> Question: any secure chunk-level dedupe design should provide a read performance on par with plaintext CBD practice.
-
-
-This paper intends to solve those three challenges.
-
-### Tapping the Potential
-- Main idea: using randomized encryption will completely protect the low-entropy chunks
-> will outright incapacitate the deduplication, this is the desired asymmetry between security and performance, but only with **minimal dedupe performance loss**.
-
-- System Model
-This paper focuses on protecting the confidentiality of predictable data
-> It assumes it can achieve the semantic security for unpredictable data with CE.
-
-All the communication channels between clients, the key server, and the backup storage system are secure.
-
-A ideal functionality $F_{dedupe}$:
-> 1. input - client: chunk $ch$, key server: a chosen secret $d_t$
-> 2. output - client: the chunk key $k$, key server: a sign, storage server: the ciphertext of chunk
-
-Goal: a probabilistic polynomial-time (PPT) adversary cannot distinguish the real-world execution of the proposed scheme.
-
-- Randomized Oblivious Key Generation (ROKG)
-Chunk encryption key can be generated by running a secure (oblivious) two-party computation between the client and the key server.
-> the key server learns nothing on the client's input and algorithm output
-> client cannot infer key server's secret
-
-Some points:
-1. a chunk obfuscation algorithm takes as input the chunk data $ch$, a random number $r$, then outputs the obfuscated chunk data $z$.
-2. The modified version of blind RSA signature
-3. In the system setup phase, the key server would generate the $n$ pairs of RSA sets, for each user, it would choose only one from the this set and blind this with the user's ID.
-
-The key design is it uses $n$ pairs of RSA, so that given any compromised client $C_j$ out of $s$ clients in the system
-> the adversary can only infer at most $\frac{s}{n}$ client's data on storage servers. ($s \geq n$)
-
-It can tweak the parameter $n$ to accommodate the real network scale.
-> the number of compromised machines depends on company's size but also
-
-- Impact on deduplication
-Model:
-the store data: $x+y$
-> $x$: the size of data that cannot be deduplicated across all the users
-> $y$: the size of data that have been deduplicated
-> degradation ratio: $\frac{x+y}{x+ny}$
-
-This can proves that when $x$ outsizes $y$ significantly, selecting a small or moderate $n$ will not introduce an obvious performance penalty.
-> the workload is deduplication-unfriendly.
-
-- Efficiency improvment for frequent insensitive data
-In file level, it extracts the immutable parts and utilize them as a file fingerprint
-> accelerate the key generation
-> 
-
-$$
-h_f = H(F_1||F_2||F_3)
-$$
-
-- Slowing down online brute-force attack
-1. per-backup rate-limiting policy:
-Given the projected backup data size and expected chunk size, it can get the budget to bound the number of requests that are allowed to be processed by key server.
-
-2. the request can be processed during a prescribed time window.
-
-- Improving Data Restore Speed
-This proposed scheme will naturally enable better read performance for a user
-> allow a duplicate chunk copy under one secret to be kept in the storage without referring it to an existing copy under another secret in an old container.
-
-Further improve the read performance: reconstruction-aware chunk placement mechanism
-> enforce a spatial locality for chunks
-> chunks under the same key server secret are stored close to each other.
-
-a slight loss of deduplication so as to achieve the desired security objectives, the chunk fragmentation level for user backup is also reduced.
-
-### Implementation and Evaluation
-- Evaluation
-Dataset: FSL MacOS Snapshot-(variable-sized chunking with average chunk size 8KB)
-Size: the total size before deduplication is 463TB
-
-TCP-based randomized oblivious key generation protocol: Python
-Deduplication simulator: C
-Restore simulator: C
-
-## 2. Strength (Contributions of the paper)
-1. this work proposes a secure deduplication considering the case for multi-users scenario. Its idea is very simple but effective. Besides the gain in security, but also for the aspect of better read performance.
-## 3. Weakness (Limitations of the paper)
-1. For each chunk, this scheme needs to generate a separate key which incurs a high overhead. (although its file-level key scheme can mitigate this impact)
-## 4. Future Works
-1. The key issue of this paper is how to guarantee the security of low-entropy chunks. And it argues other methods by saying that they do not consider the protection of low-entropy chunks and practical dedupe performance.
+---
+typora-copy-images-to: paper_figure
+---
+Tapping the Potential: Secure Chunk-based Deduplication of Encrypted Data for Cloud Backup
+------------------------------------------
+| Venue | Category |
+| :---------: | :------------------: |
+| IEEE CNS'18 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Existing secure deduplication designs are at odds with real-world dedupe requirements in terms of security and performance; they pay little attention to the challenges and practical requirements of chunk-based deduplication (CBD).
+
+
+The practical requirements:
+1. Low-entropy chunks: brute-force attacks on predictable files. CBD amplifies the attack's efficacy due to the potentially much lower entropy contained in a small chunk.
+> Question: how to reduce the risk of information leakage with minimal impact on the underlying deduplication routine?
+
+2. Increased system operation overhead: incurs higher latency and computation overhead, since the client needs to run the key generation protocol with other online parties (a key server or peer clients)
+> Question: can it speed up the key generation while still ensuring an effective deduplication function?
+
+3. Practical dedupe performance: in addition to the deduplication ratio, it also needs to consider the chunk fragmentation level, i.e., restore speed.
+A high fragmentation level typically adversely affects system read performance and further increases the data restore cost.
+> Question: any secure chunk-level dedupe design should provide read performance on par with plaintext CBD practice.
+
+
+This paper intends to solve those three challenges.
+
+### Tapping the Potential
+- Main idea: using randomized encryption completely protects the low-entropy chunks
+> it outright incapacitates deduplication for those chunks; this is the desired asymmetry between security and performance, with only **minimal dedupe performance loss**.
+
+- System Model
+This paper focuses on protecting the confidentiality of predictable data
+> It assumes semantic security can be achieved for unpredictable data with CE (convergent encryption).
+
+All the communication channels between clients, the key server, and the backup storage system are secure.
+
+An ideal functionality $F_{dedupe}$:
+> 1. input - client: chunk $ch$, key server: a chosen secret $d_t$
+> 2. output - client: the chunk key $k$, key server: a sign, storage server: the ciphertext of chunk
+
+Goal: a probabilistic polynomial-time (PPT) adversary cannot distinguish the real-world execution of the proposed scheme from the ideal functionality $F_{dedupe}$.
+
+- Randomized Oblivious Key Generation (ROKG)
+Chunk encryption key can be generated by running a secure (oblivious) two-party computation between the client and the key server.
+> the key server learns nothing about the client's input and the algorithm's output
+> the client cannot infer the key server's secret
+
+Some points:
+1. a chunk obfuscation algorithm takes as input the chunk data $ch$, a random number $r$, then outputs the obfuscated chunk data $z$.
+2. a modified version of the blind RSA signature (the standard blind-RSA flow is sketched below)
+3. In the system setup phase, the key server generates $n$ RSA key pairs; for each user, it chooses only one pair from this set based on the user's ID.
+
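+A minimal sketch of the textbook blind-RSA flow that the oblivious key generation builds on; the paper uses a *modified* version with $n$ key pairs, while this sketch shows only the standard protocol and assumes the `cryptography` package for key generation.
+
+```python
+import hashlib
+import math
+import secrets
+from cryptography.hazmat.primitives.asymmetric import rsa
+
+# Key server: one RSA key pair (the paper keeps n pairs and picks one per user).
+priv = rsa.generate_private_key(public_exponent=65537, key_size=2048)
+nums = priv.private_numbers()
+n, e, d = nums.public_numbers.n, nums.public_numbers.e, nums.d
+
+def client_blind(chunk: bytes):
+    """Client: hash the chunk and blind it so the key server learns nothing."""
+    h = int.from_bytes(hashlib.sha256(chunk).digest(), "big")
+    while True:
+        r = secrets.randbelow(n - 2) + 2
+        if math.gcd(r, n) == 1:
+            break
+    return (h * pow(r, e, n)) % n, r
+
+def server_sign(blinded: int) -> int:
+    """Key server: sign blindly with its secret exponent d."""
+    return pow(blinded, d, n)
+
+def client_unblind(signed: int, r: int) -> bytes:
+    """Client: unblind to obtain H(chunk)^d mod N and derive the chunk key."""
+    sig = (signed * pow(r, -1, n)) % n
+    return hashlib.sha256(sig.to_bytes((n.bit_length() + 7) // 8, "big")).digest()
+
+blinded, r = client_blind(b"example chunk data")
+key = client_unblind(server_sign(blinded), r)
+# The same chunk always yields the same key, so ciphertexts remain deduplicable.
+```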
+The key design is that it uses $n$ RSA key pairs, so that given any compromised client $C_j$ out of the $s$ clients in the system,
+> the adversary can infer at most $\frac{s}{n}$ clients' data on the storage servers ($s \geq n$).
+
+It can tweak the parameter $n$ to accommodate the real network scale.
+> the number of compromised machines depends on the company's size
+
+- Impact on deduplication
+Model:
+the stored data: $x+y$
+> $x$: the size of data that cannot be deduplicated across all the users
+> $y$: the size of data that have been deduplicated
+> degradation ratio: $\frac{x+y}{x+ny}$
+
+This proves that when $x$ significantly outsizes $y$, selecting a small or moderate $n$ does not introduce an obvious performance penalty (see the example below).
+> i.e., the workload is deduplication-unfriendly.
+
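+A tiny numerical example of the degradation model above; the split between $x$ and $y$ is made up purely for illustration.
+
+```python
+def degradation_ratio(x: float, y: float, n: int) -> float:
+    """(x + y) / (x + n*y), following the ratio as written above."""
+    return (x + y) / (x + n * y)
+
+# Hypothetical workload: 99 TB of non-duplicate data (x), 1 TB of duplicate data (y).
+for n in (1, 4, 16):
+    print(n, round(degradation_ratio(99, 1, n), 3))
+# n=1 -> 1.0, n=4 -> 0.971, n=16 -> 0.87: when x dominates y,
+# even a moderate n barely hurts the effective deduplication.
+```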
+- Efficiency improvement for frequent insensitive data
+At the file level, it extracts the immutable parts and utilizes them as a file fingerprint
+> accelerate the key generation
+> 
+
+$$
+h_f = H(F_1||F_2||F_3)
+$$
+
+- Slowing down online brute-force attack
+1. per-backup rate-limiting policy:
+Given the projected backup data size and the expected chunk size, it can derive a budget to bound the number of requests that are allowed to be processed by the key server.
+
+2. requests can only be processed during a prescribed time window.
+
+- Improving Data Restore Speed
+The proposed scheme naturally enables better read performance for a user
+> it allows a duplicate chunk copy under one secret to be kept in storage without referring to an existing copy under another secret in an old container.
+
+Further improve the read performance: reconstruction-aware chunk placement mechanism
+> enforce a spatial locality for chunks
+> chunks under the same key server secret are stored close to each other.
+
+By accepting a slight loss of deduplication to achieve the desired security objectives, the chunk fragmentation level for user backups is also reduced.
+
+### Implementation and Evaluation
+- Evaluation
+Dataset: FSL MacOS snapshots (variable-sized chunking with an average chunk size of 8KB)
+Size: the total size before deduplication is 463TB
+
+TCP-based randomized oblivious key generation protocol: Python
+Deduplication simulator: C
+Restore simulator: C
+
+## 2. Strength (Contributions of the paper)
+1. This work proposes a secure deduplication scheme that considers the multi-user scenario. Its idea is simple but effective: besides the gain in security, it also improves read performance.
+## 3. Weakness (Limitations of the paper)
+1. For each chunk, this scheme needs to generate a separate key, which incurs high overhead (although its file-level key scheme can mitigate this impact).
+## 4. Future Works
+1. The key issue of this paper is how to guarantee the security of low-entropy chunks. It argues against other methods by saying that they do not consider the protection of low-entropy chunks and practical dedupe performance.
2. One insight from this paper is in its threat model: it considers the case of **multi-client compromise resilience**, which is often ignored in other papers.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Secure-Dedup/UWare-DSC'18.md b/StoragePaperNote/Deduplication/Secure-Dedup/UWare-DSC'18.md
old mode 100644
new mode 100755
index 0604702..ef5df84
--- a/StoragePaperNote/Deduplication/Secure-Dedup/UWare-DSC'18.md
+++ b/StoragePaperNote/Deduplication/Secure-Dedup/UWare-DSC'18.md
@@ -1,79 +1,79 @@
----
-typora-copy-images-to: ../paper_figure
----
-A Bandwidth-Efficient Middleware for Encryption Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| DSC'12 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Existing client-side deduplication designs are not directly accplicable to build a secure deduplication middleware.
-> they are inherently vulnerable to the ownership cheating attacks (hash-only attacks).
-
-1. An overlooked side-channel in client-side deduplication
-the PoW can be abused to turn the deduplication server into an oracle, allowing an attacker to learn the file existence by observing whether or not the PoW testing is performed.
-
-2. Tradeoff among various deduplication modes
-The contradiction between file- and block- level deduplication approaches urges a solution that acquires an acceptable deduplication ratio while alleviating performance issues.
-
-This paper wants to solve those two issues above.
-> leverage the similarity characteristic of file blocks in secure deduplication.
-
-### UWare
-
-- System overview
-
-UWare will perform secure deduplication services at** the gateway of enterprise-level networks**.
-> 1. **client** is installed at a local device, encrypt and upload files to backend storage
-> 2. **Storage backend** stores all user's data
-> 3. **UWare** indexes short information of encrypted file/blocks of all users for efficiency.
-
-
-- Threats
-1. A malicious user
-attemp to launch the ownership cheating attacks or the existence-of-file attacks by using some short information.
-2. A compromised cloud storage server
-attemp to steal and learn the underlying content of the stored file ciphertexts.
-
-
-- Overlooked side-channel in deployment if their PoW protocols
-An attacker who has a file hash can know if it exists without having to complete the PoW testing.
-> Existing deduplication design perform the PoW testing after the duplicate checking passes.
-> A promising solution: make the file existence oblivious unless the user passes the PoW testing.
-
-
-
-**Key idea**: no matter initial checking successes or not, the challenge message $(r, r^{\*})$ is sent back to enforce PoW execution.
-> The user who just holds a file hash is still requested to upload the corresponding file ciphertext as she cannot compute the correct proof.
-
-- Performance issues on memory space and bandwidth for block-level deduplication
-This paper elaborates a tunable design to balance the deduplication performance and system efficiency for secure deduplication.
-> using near-exact deduplication (Broder's theorem), a small group of sampled block tags can approximately represent the entire block tags of the file.
-
-
-- Per-file randomness design
-Each file regardless of how many blocks it has, will be assigned a single randomness $r$ for the key generation.
-> assign the same randomness of the most similiar file in UWare.
-
-
-### Implementation and Evaluation
-- Evaluation
-Dataset: FSLHome-2014, 564GB
-1. Deduplication effectiveness
-Compared with plaintext deduplication
-2. Index space overhead
-Compared with plaintext deduplication
-3. Service overhead
-The time overhead of **tag generation** and **data encryption**.
-
-
-## 2. Strength (Contributions of the paper)
-1. This paper patches the PoW protocol to address the aforementioned file-existence side-channel under both threats.
-## 3. Weakness (Limitations of the paper)
-1. The key drawback of its method is it needs to do the encryption whether the file exists or not.
-## 4. Future Works
-1. This paper assumes the ciphertexts of unpredictable message cannot be distinguished by an efficient attacker except with negligible probability.
+---
+typora-copy-images-to: ../paper_figure
+---
+A Bandwidth-Efficient Middleware for Encryption Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| DSC'18 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Existing client-side deduplication designs are not directly applicable to building a secure deduplication middleware.
+> they are inherently vulnerable to ownership cheating attacks (hash-only attacks).
+
+1. An overlooked side-channel in client-side deduplication
+the PoW can be abused to turn the deduplication server into an oracle, allowing an attacker to learn the file existence by observing whether or not the PoW testing is performed.
+
+2. Tradeoff among various deduplication modes
+The contradiction between file-level and block-level deduplication approaches calls for a solution that achieves an acceptable deduplication ratio while alleviating performance issues.
+
+This paper wants to solve those two issues above.
+> leverage the similarity characteristic of file blocks in secure deduplication.
+
+### UWare
+
+- System overview
+
+UWare performs secure deduplication services at **the gateway of enterprise-level networks**.
+> 1. the **client** is installed on a local device and encrypts and uploads files to the backend storage
+> 2. the **storage backend** stores all users' data
+> 3. **UWare** indexes short information of all users' encrypted files/blocks for efficiency.
+
+
+- Threats
+1. A malicious user
+attempts to launch ownership cheating attacks or existence-of-file attacks by using some short information.
+2. A compromised cloud storage server
+attempts to steal and learn the underlying content of the stored file ciphertexts.
+
+
+- Overlooked side channel in the deployment of their PoW protocols
+An attacker who has a file hash can learn whether the file exists without having to complete the PoW testing.
+> Existing deduplication designs perform the PoW testing after the duplicate check passes.
+> A promising solution: make the file existence oblivious unless the user passes the PoW testing.
+
+
+
+**Key idea**: no matter whether the initial check succeeds or not, the challenge message $(r, r^{\*})$ is sent back to enforce PoW execution.
+> The user who just holds a file hash is still requested to upload the corresponding file ciphertext as she cannot compute the correct proof.
+
+- Performance issues on memory space and bandwidth for block-level deduplication
+This paper elaborates a tunable design to balance deduplication performance and system efficiency for secure deduplication.
+> using near-exact deduplication (Broder's theorem), a small group of sampled block tags can approximately represent the entire set of block tags of the file (see the sketch below).
+
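+A minimal sketch of representing a file by a small sample of its block tags, in the spirit of the Broder-style sampling mentioned above; the "k smallest tags" rule and the value of k are illustrative assumptions, not UWare's exact design.
+
+```python
+import hashlib
+
+def block_tag(block: bytes) -> int:
+    """Tag of a single block (SHA-256 truncated to 64 bits for compactness)."""
+    return int.from_bytes(hashlib.sha256(block).digest()[:8], "big")
+
+def sampled_tags(blocks: list[bytes], k: int = 8) -> set[int]:
+    """Keep only the k smallest tags as the file's representative sample."""
+    return set(sorted(block_tag(b) for b in blocks)[:k])
+
+def similarity(sample_a: set[int], sample_b: set[int]) -> float:
+    """Jaccard-style estimate over the samples; a high value suggests the two
+    files share most blocks, so they can reuse the same per-file randomness."""
+    if not sample_a or not sample_b:
+        return 0.0
+    return len(sample_a & sample_b) / len(sample_a | sample_b)
+```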
+
+- Per-file randomness design
+Each file, regardless of how many blocks it has, is assigned a single randomness $r$ for key generation.
+> it is assigned the same randomness as the most similar file indexed in UWare.
+
+
+### Implementation and Evaluation
+- Evaluation
+Dataset: FSLHome-2014, 564GB
+1. Deduplication effectiveness
+Compared with plaintext deduplication
+2. Index space overhead
+Compared with plaintext deduplication
+3. Service overhead
+The time overhead of **tag generation** and **data encryption**.
+
+
+## 2. Strength (Contributions of the paper)
+1. This paper patches the PoW protocol to address the aforementioned file-existence side-channel under both threats.
+## 3. Weakness (Limitations of the paper)
+1. The key drawback of this method is that it needs to perform the encryption whether or not the file already exists.
+## 4. Future Works
+1. This paper assumes the ciphertexts of unpredictable messages cannot be distinguished by an efficient attacker except with negligible probability.
2. This paper argues that near-exact deduplication can achieve lower memory cost at the cost of a decreased deduplication ratio.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Summary/99DeduplicationProblem-HotStorage'16.md b/StoragePaperNote/Deduplication/Summary/99DeduplicationProblem-HotStorage'16.md
old mode 100644
new mode 100755
index c6b43f8..7117c76
--- a/StoragePaperNote/Deduplication/Summary/99DeduplicationProblem-HotStorage'16.md
+++ b/StoragePaperNote/Deduplication/Summary/99DeduplicationProblem-HotStorage'16.md
@@ -1,80 +1,80 @@
----
-typora-copy-images-to: paper_figure
----
-99 Deduplication Problems
-------------------------------------------
-| Venue | Category |
-| :-----------: | :-----------: |
-| HotStorage'16 | Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Most deduplication-related publications focus on a narrow range of topic:
-> maximizing deduplication ratios
-> read/write performance
-
-This papers believes that there are numerous novel, deduplication-specific problems that have been largely ignored in the academic community.
-
-
-
-### 99 Deduplication Problems
-- Capacity
-
-> non-deduplication system: a fairly straightforward to answer this kind of question since such systems
-> **How to write data?** (the logical write value is not real value that to write)
-> **How to delete data?** (the logical free value is not the real return value)
-> A deduplication system is dynamic, it is hard to track a file in real time
->
-> > an intuitive way is collected information periodically.
-
-**Future Research Opportunities**:
-1. Offline process: periodically calculate the unique content for each file
-2. Inline process: provide hints about capacity that are at least accurate within an approximation threshold.
-3. Estimation size: estimation tools, capacity usage, performance, network usage
-
-
-- Quality of Service
-A deduplication storage system should also meet the requirement regarding to quality of service for a client.
-> latency, throughput
-> Root reason: adds additional levels of indirection to map from a file representation to the data chunks locations.
-> Performance drop off: turn sequentially written content into references to chunks scattered across the HDDs.
-
-**Future Research Opportunities**
-1. Restore performance
-2. Garbage collection
-3. arware the potential latency of various operations
-4. QoS: latency, throughput, and priority level
-> Shared content creates unpredicatable performance.
-
-
-
-- Security and Reliability
-Maintain advantages of deduplication while securely storing data, preventing
-> 1. Unauthorized access
-> 2. Knowledge of content
-> 3. Data tampering
-
-By timing data transfers, it may also be possible to infer what already exists on a deduplicaion servers.
-
-**Reliability**
-the complex relationship between deduplication and data reliability.
-> Intuitively, the combination of RAID, versioning, and replicating counterbalances a risk of data loss due deduplication
-> How to quantitatively analyzing the reliability?
-
-**Future Research Opportunities**
-1. complete an empirical measurement of data loss rates for various flavors of deduplicated storage.
-> needs vendors to release such information
-
-2. model the risk of each component of the system using published failure characteristics and calculate the reliability for the storage environment.
-
-- Chargeback for service providers
-
-**Future Research Opportunities**
-1. need to accurate measurements for post-deduplication resource usage (capacity, I/O, network, etc.)
-
-## 4. Future Works
-After reading this paper, I can get a high level picture of potential research topics in deduplication area. Some points can be followed
-> Security and reliability
-> Capacity measurement
+---
+typora-copy-images-to: paper_figure
+---
+99 Deduplication Problems
+------------------------------------------
+| Venue | Category |
+| :-----------: | :-----------: |
+| HotStorage'16 | Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Most deduplication-related publications focus on a narrow range of topics:
+> maximizing deduplication ratios
+> read/write performance
+
+This paper believes that there are numerous novel, deduplication-specific problems that have been largely ignored by the academic community.
+
+
+
+### 99 Deduplication Problems
+- Capacity
+
+> non-deduplication system: it is fairly straightforward to answer this kind of question, since the logical sizes such systems report match the physical space consumed
+> **How to write data?** (the logical write size is not the real amount of data written)
+> **How to delete data?** (the logical amount freed is not the real amount of space returned)
+> A deduplication system is dynamic; it is hard to track a file's unique content in real time
+>
+> > an intuitive way is to collect this information periodically.
+
+**Future Research Opportunities**:
+1. Offline process: periodically calculate the unique content for each file
+2. Inline process: provide hints about capacity that are at least accurate within an approximation threshold.
+3. Size estimation: estimation tools for capacity usage, performance, and network usage
+
+
+- Quality of Service
+A deduplication storage system should also meet quality-of-service requirements for a client.
+> latency, throughput
+> Root cause: deduplication adds additional levels of indirection to map from a file representation to the data chunks' locations.
+> Performance drop-off: sequentially written content turns into references to chunks scattered across the HDDs.
+
+**Future Research Opportunities**
+1. Restore performance
+2. Garbage collection
+3. Awareness of the potential latency of various operations
+4. QoS: latency, throughput, and priority level
+> Shared content creates unpredictable performance.
+
+
+
+- Security and Reliability
+Maintain the advantages of deduplication while securely storing data, preventing:
+> 1. Unauthorized access
+> 2. Knowledge of content
+> 3. Data tampering
+
+By timing data transfers, it may also be possible to infer what already exists on a deduplication server.
+
+**Reliability**
+There is a complex relationship between deduplication and data reliability.
+> Intuitively, the combination of RAID, versioning, and replication counterbalances the risk of data loss due to deduplication
+> How to quantitatively analyze the reliability?
+
+**Future Research Opportunities**
+1. complete an empirical measurement of data loss rates for various flavors of deduplicated storage.
+> requires vendors to release such information
+
+2. model the risk of each component of the system using published failure characteristics and calculate the reliability for the storage environment.
+
+- Chargeback for service providers
+
+**Future Research Opportunities**
+1. needs accurate measurements of post-deduplication resource usage (capacity, I/O, network, etc.)
+
+## 4. Future Works
+After reading this paper, I have a high-level picture of potential research topics in the deduplication area. Some directions that can be followed:
+> Security and reliability
+> Capacity measurement
>
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Workload-Analysis/BackupWorkloads-FAST'12.md b/StoragePaperNote/Deduplication/Workload-Analysis/BackupWorkloads-FAST'12.md
old mode 100644
new mode 100755
index 4388509..6605d71
--- a/StoragePaperNote/Deduplication/Workload-Analysis/BackupWorkloads-FAST'12.md
+++ b/StoragePaperNote/Deduplication/Workload-Analysis/BackupWorkloads-FAST'12.md
@@ -1,117 +1,117 @@
----
-typora-copy-images-to: ../paper_figure
----
-Characteristics of Backup Workloads in Production Systems
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'12 | Workload Analysis |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-There have been numerous studies over the past 30 years of file system characteristics.
-> there has been little in the way of corresponding studies for **backup** systems.
-> data backups are used to protect primary data.
-
-- Goal
-highlight the different requirements between **backup** and **primary** storage.
-> backup filesystems have had to scale their throughput to meet storage growth.
-
-### Backup analysis
-- Data collection
-The most practical way of conducting a large-scale study is to instead collect filesystem-level statistics and content metadata. (i.e., data about the data)
-> 1. autosupport reports (storage usage, compression, file counts and ages, caching statistics and other metrics): limited in detail but wide in deployment.
-> 2. chunk-level metadata (chunk hash identifiers, sizes and location): contain great detail but are limited in deployment.
-
-
-- Collecting content metadata
-collect the file recipes (listing of chunk fingerprints) for each file and then collect the deduplicated chunk metadata from the storage containers, as well as sub-chunk fingerprints.
-> get the information of sub-chunk: can be used to investigate deduplication rates at various chunk sizes smaller than the default 8KB.
-
-- Trends across backup storage systems
-Graph their primary workload results alongside its backup storage results.
-> both a histogram (probability distribution) and a cumulative distribution function (CDF)
-
-1. File size
-For backup, this size distribution is about 3 orders of magnitude larger than for primary files.
-> since it combines individual files together from the primary storage system into "tar-like" collections.
-> larger files reduce the likelihood of whole-file deduplication but increase the *stream locality* within the system.
-
-2. File and directory count
-File and directory counts are typically much lower in backup workloads.
-
-
-3. File age
-For backup workload, the median age is about 3 weeks.
-> short retention periods lead to higher data churn.
-
-
-- Effect of varying chunk size
-
-1. Metadata overhead
-Every chunk requires certain metadata to track its location
-> the aggregate overhead scales inversely with chunk size.
-
-It assumes a small fixed cost (30 bytes, a fingerprint, chunk length, and a small overhead for other metadata)
-> per physical chunk stored in the system
-> per logical chunk in a file recipe
-
-use a factor $f$ to report the reduction in deduplication effectiveness
-> $f$: the metadata size divided by the average chunk size.
-
-the real deduplication $D^{'}$ includes metadata costs:
-$$
-D^{'} = \frac{L}{P+fP+fL}
-$$
-without metadata costs:
-$$
-D = \frac{L}{P}
-$$
-> Sometime the improvement in deduplication of small chunk size sufficiently compensates for the added per-chunk metadata.
-
-
-### Implementation and Evaluation
-- Evaluation
-1. Cache efficiency
-
-Writes: Using stream locality hints achieves good deduplication hit rates with caches.
-> stream locality: container level
-
-
-Reads: use cache to provide fast restores of data during disaster recovery.
-
-
-## 2. Strength (Contributions of the paper)
-1. analyze statistics from a broad set of 10,000+ production EMC Data Domain systems
-> show the backup workload tends to have shorter-lived and larger files than primary storage.
-
-2. uses a novel technique for extrapolating deduplication rates across a range of possible sizes.
-> using a single chunk size to extrapolate deduplication at larger chunk sizes.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-1. It mentions that the backup storage workloads are tied to the **applications**
-
-> depends on the applications which generates them
-
-Backup: individual files are typically combined into large units ("tar" file)
-
-2. weekly "full" backups and daily "incremental" backups
-
-Incremental backups: files are modified (a large portions in common with earlier versions)
-Full backups: are likely to have many of their comprising files completely unmodified
-
-
-3. Backup system
-
-Windows 2000: entire files deduplication
-Venti: fixed-block deduplication
-LBFS: variable-sized chunks deduplication
-
-4. Compression region
-
-Combining unique chunks into "compression regions"
-> aggregate new unique chunks into compression regions, which are compressed as a single unit (approximately 128KB before compression)
+---
+typora-copy-images-to: ../paper_figure
+---
+Characteristics of Backup Workloads in Production Systems
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'12 | Workload Analysis |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+There have been numerous studies over the past 30 years of file system characteristics.
+> there has been little in the way of corresponding studies for **backup** systems.
+> data backups are used to protect primary data.
+
+- Goal
+highlight the different requirements between **backup** and **primary** storage.
+> backup filesystems have had to scale their throughput to meet storage growth.
+
+### Backup analysis
+- Data collection
+The most practical way of conducting a large-scale study is to instead collect filesystem-level statistics and content metadata. (i.e., data about the data)
+> 1. autosupport reports (storage usage, compression, file counts and ages, caching statistics and other metrics): limited in detail but wide in deployment.
+> 2. chunk-level metadata (chunk hash identifiers, sizes and location): contain great detail but are limited in deployment.
+
+
+- Collecting content metadata
+collect the file recipes (listing of chunk fingerprints) for each file and then collect the deduplicated chunk metadata from the storage containers, as well as sub-chunk fingerprints.
+> get the information of sub-chunk: can be used to investigate deduplication rates at various chunk sizes smaller than the default 8KB.
+
+- Trends across backup storage systems
+Graph their primary workload results alongside its backup storage results.
+> both a histogram (probability distribution) and a cumulative distribution function (CDF)
+
+1. File size
+For backup, this size distribution is about 3 orders of magnitude larger than for primary files.
+> since it combines individual files together from the primary storage system into "tar-like" collections.
+> larger files reduce the likelihood of whole-file deduplication but increase the *stream locality* within the system.
+
+2. File and directory count
+File and directory counts are typically much lower in backup workloads.
+
+
+3. File age
+For backup workload, the median age is about 3 weeks.
+> short retention periods lead to higher data churn.
+
+
+- Effect of varying chunk size
+
+1. Metadata overhead
+Every chunk requires certain metadata to track its location
+> the aggregate overhead scales inversely with chunk size.
+
+It assumes a small fixed cost (30 bytes, a fingerprint, chunk length, and a small overhead for other metadata)
+> per physical chunk stored in the system
+> per logical chunk in a file recipe
+
+use a factor $f$ to report the reduction in deduplication effectiveness
+> $f$: the metadata size divided by the average chunk size.
+
+the real deduplication $D^{'}$ includes metadata costs:
+$$
+D^{'} = \frac{L}{P+fP+fL}
+$$
+without metadata costs:
+$$
+D = \frac{L}{P}
+$$
+> Sometimes the improvement in deduplication from a smaller chunk size sufficiently compensates for the added per-chunk metadata.
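+
+As a quick numeric check of the formulas above (illustrative numbers only, not from the paper), the snippet below compares the raw deduplication ratio $D$ with the metadata-adjusted $D^{'}$ for two hypothetical chunk sizes:
+
+```python
+def dedup_with_metadata(L, P, chunk_size, per_chunk_metadata=30):
+    """Return (D, D') where D = L/P ignores metadata and
+    D' = L / (P + f*P + f*L) charges f = metadata/chunk_size bytes
+    per physical chunk and per logical recipe entry."""
+    f = per_chunk_metadata / chunk_size
+    return L / P, L / (P + f * P + f * L)
+
+TB = 10**12
+# 8 KB chunks: 10 TB logical deduplicated to 1 TB physical.
+print(dedup_with_metadata(10 * TB, 1.0 * TB, chunk_size=8 * 1024))   # ~(10.0, 9.61)
+# 4 KB chunks: better deduplication (0.9 TB physical) but more metadata.
+print(dedup_with_metadata(10 * TB, 0.9 * TB, chunk_size=4 * 1024))   # ~(11.1, 10.2)
+```
+
+In this made-up case the smaller chunk size still wins after metadata is charged, which is exactly the situation the note above describes.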
+
+
+### Implementation and Evaluation
+- Evaluation
+1. Cache efficiency
+
+Writes: Using stream locality hints achieves good deduplication hit rates with caches.
+> stream locality: container level
+
+
+Reads: use cache to provide fast restores of data during disaster recovery.
+
+
+## 2. Strength (Contributions of the paper)
+1. analyze statistics from a broad set of 10,000+ production EMC Data Domain systems
+> show the backup workload tends to have shorter-lived and larger files than primary storage.
+
+2. uses a novel technique for extrapolating deduplication rates across a range of possible sizes.
+> using a single chunk size to extrapolate deduplication at larger chunk sizes.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. It mentions that the backup storage workloads are tied to the **applications**
+
+> they depend on the applications which generate them
+
+Backup: individual files are typically combined into large units ("tar" file)
+
+2. weekly "full" backups and daily "incremental" backups
+
+Incremental backups: contain files that are modified (large portions in common with earlier versions)
+Full backups: are likely to have many of their constituent files completely unmodified
+
+
+3. Backup system
+
+Windows 2000: whole-file deduplication
+Venti: fixed-size block deduplication
+LBFS: variable-sized chunk deduplication
+
+4. Compression region
+
+Combining unique chunks into "compression regions"
+> aggregate new unique chunks into compression regions, which are compressed as a single unit (approximately 128KB before compression)
diff --git a/StoragePaperNote/Deduplication/Workload-Analysis/CapacityForecasting-LISA'11.md b/StoragePaperNote/Deduplication/Workload-Analysis/CapacityForecasting-LISA'11.md
old mode 100644
new mode 100755
index f17fc7a..78ba78c
--- a/StoragePaperNote/Deduplication/Workload-Analysis/CapacityForecasting-LISA'11.md
+++ b/StoragePaperNote/Deduplication/Workload-Analysis/CapacityForecasting-LISA'11.md
@@ -1,100 +1,100 @@
----
-typora-copy-images-to: ../paper_figure
----
-Capacity Forecasting in a Backup Storage Environment
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| LISA'11 | Workload Analysis |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Many system administrators already have historical data for their systems and thus can predict full capacity events in advance.
-
-It needs a proactive tool
-> 1. predicts the date of full capacity and provides advance notification.
-> 2. there seems to be little previous work discussing applications of predictive modeling to data storage environments.
-
-This paper presents the predictive model employed internally at EMC to forecast system capacity.
-> generate alert notification months before systems reach full capacity.
-
-
-### Data Domain
-- Data collection
-1. employ inline deduplication technology on disk.
-> Customers can configure their Data Domain systems to send an email everyday with detailed **diagnostic information**.
-
-2. Most customers choose to send autosupports to EMC
-> the historical data enables more effective customer support.
-
-Two variables of capacity forecasting:
-> 1. Total physical capacity of the system (changes over time)
-> 2. Total physical space used by the system
-
-- Normal predictive model
-1. the most common methods employed in predictive modeling is **linear regression**
-> 1. This is challenging because behavior changes
-> 2. blind application of regression to the entire data set often leads to poor predictions.
-> 
-
-2. select a subset of recent data
-choose a subset of recent data such as the prior 30 days
-> 1. eliminates the influence of the older data and improves the accuracy of the model's predictions.
-
-
-- Piecewise linear regression
-**Main idea**: analyze the quality of many linear regressions and then selects the one having the best fit.
-
-1. How to reduce the error rate of the original linear regression model
-> 1. applying the regression to a data subset that best represents the most recent behavior.
-
-2. How to find the best subset of data?
-> 1. the boundary must be determined where the recent behavior begins to deviate.
-> 2. "goodness-of-fit" of a linear regression: $R^2 = 1$ indicates perfectly linear data
-$$
-R^2 = \frac{SSM}{SST} = \frac{\sum_i [f(x_i) - \bar{y}]}{\sum_i [y_i - \bar{y}]}
-$$
-> 3. select the subset with maximum $R^2$, from $\{(x_{-n}, y_{-n}),......,(x_0, y_0)\}$.
-> 4. the calculated boundary occurs near the discontinuity of the truc function.
-
-
-
-
-- Model validation
-Validation rules are applied to the results of the linear model to determine if capacity forecasts should be pulished.
-> 1. Goodness-of-fit
-> 2. positive slope
-
-### Implementation and Evaluation
-- Evaluation
-1. Analysis of the quality of forecasts
-> false positive: hardware changes, software changes
-> from a statistical perspective, it is unknown whether the recent data points are signal or noise.
-
-2. Capacity forecasting example
-In Data Domain storage systems
-
-## 2. Strength (Contributions of the paper)
-1. This paper shows that there is a trade-off in predicative model
-eliminating reasonable models vs. generating false positives
-> By requiring more data for models, it can gain higher confidence in their predictions, but reduce the advanced notification for true positives.
-
-## 3. Weakness (Limitations of the paper)
-1. If historical data does not demonstrate linear growth, then obviously linear regression would be a poor.
-
-
-## 4. Future Works
-1. In this paper, it mentions there exists many other models which can be applied to **time series data**.
-> 1. weighted linear regression
-> 2. logarithmic regression
-> 3. auto-regressive (AR) model
-
-There is an open question whether the remaining systems can be modeled by other methods.
-
-2. How to improve this model to be compatible with some other systems? or find other applications of this predicative model?
-predicate bandwidth throughput, load-balancing or I/O capacity.
-
-3. The paper shows that the majority systems exhibit very linear behavior since the linear model had a very good fit the datasets.
-
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Capacity Forecasting in a Backup Storage Environment
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| LISA'11 | Workload Analysis |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Many system administrators already have historical data for their systems and thus can predict full capacity events in advance.
+
+There is a need for a proactive tool
+> 1. predicts the date of full capacity and provides advance notification.
+> 2. there seems to be little previous work discussing applications of predictive modeling to data storage environments.
+
+This paper presents the predictive model employed internally at EMC to forecast system capacity.
+> generate alert notification months before systems reach full capacity.
+
+
+### Data Domain
+- Data collection
+1. employ inline deduplication technology on disk.
+> Customers can configure their Data Domain systems to send an email every day with detailed **diagnostic information**.
+
+2. Most customers choose to send autosupports to EMC
+> the historical data enables more effective customer support.
+
+Two variables of capacity forecasting:
+> 1. Total physical capacity of the system (changes over time)
+> 2. Total physical space used by the system
+
+- Normal predictive model
+1. one of the most common methods employed in predictive modeling is **linear regression**
+> 1. This is challenging because behavior changes
+> 2. blind application of regression to the entire data set often leads to poor predictions.
+> 
+
+2. select a subset of recent data
+choose a subset of recent data such as the prior 30 days
+> 1. eliminates the influence of the older data and improves the accuracy of the model's predictions.
+
+
+- Piecewise linear regression
+**Main idea**: analyze the quality of many linear regressions and then selects the one having the best fit.
+
+1. How to reduce the error rate of the original linear regression model
+> 1. applying the regression to a data subset that best represents the most recent behavior.
+
+2. How to find the best subset of data? (see the sketch after this list)
+> 1. the boundary must be determined where the recent behavior begins to deviate.
+> 2. "goodness-of-fit" of a linear regression: $R^2 = 1$ indicates perfectly linear data
+$$
+R^2 = \frac{SSM}{SST} = \frac{\sum_i [f(x_i) - \bar{y}]^2}{\sum_i [y_i - \bar{y}]^2}
+$$
+> 3. select the subset with maximum $R^2$, from $\{(x_{-n}, y_{-n}), \ldots, (x_0, y_0)\}$.
+> 4. the calculated boundary occurs near the discontinuity of the underlying function.
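+
+A small sketch of this subset-selection idea (my own illustration, not the EMC tool): fit a line to every suffix of the history with at least three points, keep the suffix with the highest $R^2$, and extrapolate that line to the full-capacity date. The positive-slope check mirrors one of the validation rules below.
+
+```python
+import numpy as np
+
+def best_recent_fit(days, used_gb):
+    """Try each suffix (the most recent k points, k >= 3), fit a line,
+    and return (slope, intercept, r2) for the suffix with the best R^2."""
+    x, y = np.asarray(days, float), np.asarray(used_gb, float)
+    best = None
+    for start in range(len(x) - 2):
+        xs, ys = x[start:], y[start:]
+        slope, intercept = np.polyfit(xs, ys, 1)
+        residual = ys - (slope * xs + intercept)
+        ss_tot = np.sum((ys - ys.mean()) ** 2)
+        r2 = 1 - np.sum(residual ** 2) / ss_tot if ss_tot > 0 else 0.0
+        if best is None or r2 > best[2]:
+            best = (slope, intercept, r2)
+    return best
+
+def days_until_full(slope, intercept, today, capacity_gb):
+    """Extrapolate the fitted line to the day usage reaches capacity."""
+    if slope <= 0:
+        return None  # only publish forecasts for growing systems
+    return (capacity_gb - (slope * today + intercept)) / slope
+```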
+
+
+
+
+- Model validation
+Validation rules are applied to the results of the linear model to determine if capacity forecasts should be published.
+> 1. Goodness-of-fit
+> 2. positive slope
+
+### Implementation and Evaluation
+- Evaluation
+1. Analysis of the quality of forecasts
+> false positive: hardware changes, software changes
+> from a statistical perspective, it is unknown whether the recent data points are signal or noise.
+
+2. Capacity forecasting example
+In Data Domain storage systems
+
+## 2. Strength (Contributions of the paper)
+1. This paper shows that there is a trade-off in the predictive model:
+eliminating reasonable models vs. generating false positives
+> By requiring more data for models, it can gain higher confidence in their predictions, but this reduces the advance notification for true positives.
+
+## 3. Weakness (Limitations of the paper)
+1. If historical data does not demonstrate linear growth, then obviously linear regression would be a poor fit.
+
+
+## 4. Future Works
+1. In this paper, it mentions that there exist many other models which can be applied to **time series data**.
+> 1. weighted linear regression
+> 2. logarithmic regression
+> 3. auto-regressive (AR) model
+
+There is an open question whether the remaining systems can be modeled by other methods.
+
+2. How to improve this model to be compatible with other systems, or find other applications of this predictive model?
+e.g., predicting bandwidth/throughput, load balancing, or I/O capacity.
+
+3. The paper shows that the majority of systems exhibit very linear behavior, since the linear model fit the datasets very well.
+
+
diff --git a/StoragePaperNote/Deduplication/Workload-Analysis/PracticalDedup-FAST'11.md b/StoragePaperNote/Deduplication/Workload-Analysis/PracticalDedup-FAST'11.md
old mode 100644
new mode 100755
index 97fa081..75d44cd
--- a/StoragePaperNote/Deduplication/Workload-Analysis/PracticalDedup-FAST'11.md
+++ b/StoragePaperNote/Deduplication/Workload-Analysis/PracticalDedup-FAST'11.md
@@ -1,75 +1,75 @@
----
-typora-copy-images-to: ../paper_figure
----
-A Study of Practical Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'11 | Deduplication workload analysis |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Deduplication can work at either the sub-file or whole-file level.
-> More fine-grained deduplication creates more opportunities for space savings.
-> Drawback: reduces the sequential layout of some files, impacts the performance when disks are used for storage.
-
-This paper consists of 857 file systems spanning 162 terabytes of disk over 4 weeks.
-> from a broad cross-section of employees
-
-This paper also conducts a study of metadata adn data layout.
-> 1. storage being consumed by files of increasing size continues unabated.
-> 2. file-level fragmentation is not widespread.
-
-### MS Trace
-
-- Use salted MD5 as its hash algorithm
-Truncate teh result to 48 bits
-> in order to reduce the size of the data set.
-> had the largest number of unique hashes, somewhat more than 768M, expect that about two thousand of those (0.0003%) are false matches due the truncated hash.
-
-- Post processing
-As an optimization, it observes the actual value of any unique hash was not useful to its analysis. (i.e., hashes of content that was not duplicated)
-> Novel 2-pass algorithm:
-> First pass: if it tried to insert a value that was already in the Bloom filter, then it inserts it into a second bloom filter of equal size.
-> Second pass: comparing the each hash to the second bloom filter only, if it was not found in the second filter, it was certain that the hash had been seen exactly once and could be omitted from the database. (very simple)
-
-- Consider different deduplication domain
-1. Deduplication in primary storage
-
-
-2. Deduplication in backup storage
-Performance in secondary storage is less critical than in that of primary, so the reduced sequentiality of a block-level deduplicated store is of lesser concern.
-
-
-- Metadata analysis
-This paper analyzes the file systems in terms of
-> age, capacity, fullness
-> the number of files and directories
-
-
-- File times
-This paper shows that most files are modified between one month and a year ago, but about 20% are modified within the last month.
-
-- On-disk layout
-The behavior and characteristics of magnetic disks continue to be a dominant concern in storage system.
-> It argues that it is not true that file system performance changes over time, largely due to fragmentation.
-> Because of the defragmenter in modern operating system
-
-### Implementation and Evaluation
-- UBC data set
-This data set contains scans of 857 file systems hosted on 597 computers
-> windows, windows vista, windows server
-
-## 2. Strength (Contributions of the paper)
-1. this paper leverages a two-pass to filter out those chunks which only appears once.
-
-2. The main contribution of this work is it also tracks file system fragmentation and data placement, which hash not been analyzed previously or at large scale.
-> file system data
-> metadata
-> data layout out
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: ../paper_figure
+---
+A Study of Practical Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'11 | Deduplication workload analysis |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Deduplication can work at either the sub-file or whole-file level.
+> More fine-grained deduplication creates more opportunities for space savings.
+> Drawback: reduces the sequential layout of some files, impacts the performance when disks are used for storage.
+
+This paper's study covers 857 file systems spanning 162 terabytes of disk over 4 weeks.
+> from a broad cross-section of employees
+
+This paper also conducts a study of metadata and data layout.
+> 1. storage being consumed by files of increasing size continues unabated.
+> 2. file-level fragmentation is not widespread.
+
+### MS Trace
+
+- Use salted MD5 as its hash algorithm
+Truncate the result to 48 bits (see the snippet below)
+> in order to reduce the size of the data set.
+> had the largest number of unique hashes, somewhat more than 768M, and expects that about two thousand of those (0.0003%) are false matches due to the truncated hash.
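+
+A minimal version of that fingerprinting step (the salt value here is made up; the study's actual salt is not given):
+
+```python
+import hashlib
+
+def truncated_fingerprint(chunk: bytes, salt: bytes = b"example-salt") -> int:
+    """Salted MD5 truncated to 48 bits, used as a compact chunk identifier."""
+    digest = hashlib.md5(salt + chunk).digest()
+    return int.from_bytes(digest[:6], "big")  # keep the first 6 bytes = 48 bits
+```
+
+With roughly 768M distinct hashes in a 48-bit space, the birthday bound predicts on the order of a thousand colliding pairs, which is consistent with the small false-match rate quoted above.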
+
+- Post processing
+As an optimization, it observes that the actual value of any unique hash was not useful to its analysis (i.e., hashes of content that was not duplicated).
+> Novel 2-pass algorithm (a minimal sketch follows below):
+> First pass: if inserting a value that is already in the first Bloom filter, also insert it into a second Bloom filter of equal size.
+> Second pass: compare each hash against the second Bloom filter only; if it is not found there, the hash was certainly seen exactly once and can be omitted from the database. (very simple)
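+
+A minimal sketch of the two-pass idea (my own illustration, not the paper's implementation; the Bloom filter here is deliberately small and simplistic):
+
+```python
+import hashlib
+
+class Bloom:
+    """A small Bloom filter using double hashing over a SHA-1 digest."""
+    def __init__(self, bits=1 << 24, hashes=4):
+        self.bits, self.hashes = bits, hashes
+        self.array = bytearray(bits // 8)
+
+    def _positions(self, item: bytes):
+        h = hashlib.sha1(item).digest()
+        a = int.from_bytes(h[:8], "big")
+        b = int.from_bytes(h[8:16], "big") | 1
+        return [(a + i * b) % self.bits for i in range(self.hashes)]
+
+    def add(self, item: bytes) -> bool:
+        """Insert and report whether the item was (probably) already present."""
+        pos = self._positions(item)
+        seen = all(self.array[p >> 3] & (1 << (p & 7)) for p in pos)
+        for p in pos:
+            self.array[p >> 3] |= 1 << (p & 7)
+        return seen
+
+    def __contains__(self, item: bytes) -> bool:
+        return all(self.array[p >> 3] & (1 << (p & 7)) for p in self._positions(item))
+
+def keep_duplicated_hashes(hashes):
+    """Pass 1 fills the second filter with hashes seen more than once;
+    pass 2 drops every hash absent from the second filter, i.e. hashes
+    that were certainly seen exactly once."""
+    first, second = Bloom(), Bloom()
+    for h in hashes:                                 # pass 1
+        if first.add(h):
+            second.add(h)
+    return [h for h in hashes if h in second]        # pass 2
+```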
+
+- Consider different deduplication domain
+1. Deduplication in primary storage
+
+
+2. Deduplication in backup storage
+Performance in secondary storage is less critical than in that of primary, so the reduced sequentiality of a block-level deduplicated store is of lesser concern.
+
+
+- Metadata analysis
+This paper analyzes the file systems in terms of
+> age, capacity, fullness
+> the number of files and directories
+
+
+- File times
+This paper shows that most files are modified between one month and a year ago, but about 20% are modified within the last month.
+
+- On-disk layout
+The behavior and characteristics of magnetic disks continue to be a dominant concern in storage systems.
+> It argues against the conventional wisdom that file system performance degrades over time largely due to fragmentation.
+> Because of the defragmenter in modern operating systems
+
+### Implementation and Evaluation
+- UBC data set
+This data set contains scans of 857 file systems hosted on 597 computers
+> windows, windows vista, windows server
+
+## 2. Strength (Contributions of the paper)
+1. this paper leverages a two-pass algorithm to filter out those chunks which appear only once.
+
+2. The main contribution of this work is that it also tracks file system fragmentation and data placement, which has not been analyzed previously or at large scale.
+> file system data
+> metadata
+> data layout
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
1. This paper follows the convention of showing the CDF and histogram in the same figure, which is a good way to show the data distribution.
\ No newline at end of file
diff --git a/StoragePaperNote/Deduplication/Workload-Analysis/SimRedundancy-TC'18.md b/StoragePaperNote/Deduplication/Workload-Analysis/SimRedundancy-TC'18.md
old mode 100644
new mode 100755
index 6aee501..90ce24b
--- a/StoragePaperNote/Deduplication/Workload-Analysis/SimRedundancy-TC'18.md
+++ b/StoragePaperNote/Deduplication/Workload-Analysis/SimRedundancy-TC'18.md
@@ -1,117 +1,117 @@
----
-typora-copy-images-to: paper_figure
----
-A Simulation Analysis of Redundancy and Reliability in Primary Storage Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| TC'18 | Deduplication Reliability |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Despite the wide adoption of deduplication, how deduplication affects storage system reliability remains debatable when compared without deduplication.
-> mitigate the possibility of data loss by reducing storage footprints.
-> amplifies the severity of each data loss event, which may corrupt multiple chunks or files that share the same lost data.
-
-- State-of-art countermeasures:
-> add redundancy via replication or erasure coding to post-deduplication data.
-> propose quantitative methods to evaluate deduplication storage reliability.
-
-- two key open reliability issues:
-> loss variations: different failures (device failures, latent sector errors), different granularities of storage (chunk, files)
-> repair strategies: repair strategies determine whether import data copies are repaired first (affect reliability in different ways)
-
-- Main motivation
-The importance of analyzing and comparing storage system reliability with and without deduplication.
-
-### Method Name
-- Choose dataset
-1. FSL:
-pick nine random snapshots with raw size at least 100GB each
-> Mac OS X server:
-> user011 - user026 (eight users): taken from different users' home directories with various types of files.
-> 
-
-2. MS:
-collected at Microsoft and publicized on SNIA. Focus on a total 903 file system snapshots taht are collected in a single week
-> time: September 18, 2009
-> the average chunk size: 8KB
-
-Also, consider the notion of a *deduplication domain* (a set of file system snapshots over which it performs deduplication).
-> deduplication domain size specifics the number of file system snapshots included in a deduplication domain
-> generate 10 random deduplication domains for each deduplication domain size.
-
-
-- Redundancy analysis
-1. Reference counts
-Intuition: the important of a chunk is **proportional to its reference count**.
-> the majority of chunks have small reference counts. (e.g., referenced by exactly once)
-> losing the highly referenced chunks may lead to severe loss of information as well as high deviations in the reliability simulations.
-
-2. How to determine similar files?
-> 1. share the same minimum chunk fingerprint (Minhash, Broder's theorem)
-> 2. share the same maximum chunk fingerprint
-> 3. have the same extension (provide an additional indicator if the two files are similar)
-
-FSL shows significant fractions of intra-file redundancy.
-The most common redundancy source are duplicate files
-> whole-file deduplication is effective.
-
-- Simulation framework
-Design a framework which analyzes and compares storage system reliability with and without deduplication. Need to configure those factors:
-> 1. failure patterns
-> 2. metadata
-> 3. data layout
-
-
-
-
-- Deliberate copy technique
-Based on the key observation:
-> Highly referenced chunks only account for a small fraction of physical capacity after deduplication
-> the chunk reference counts show a long-tailed distribution
-> It is possible to allocate a small dedicated physical area for storing extra copies of highly refenced chunks
-
-Solution:
-1. allocate the first 1% of physical sectors for the highly referenced chunks
-2. sort the chunks by their reference counts, and fill the dedicated sectors with the top 1% most highly referenced chunks.
-> do this process offline (no need to change the write/read path)
-> incur moderate stroage overhead
-
-
-### Implementation and Evaluation
-- Evaluation
-1. observation-1
-Deduplication will significantly alter the expected amounts of corrupted chunks by USEs when compared to without deduplication.
-
-2. observation-2
-The logical repair progress is affected by the placement of highly referenced chunks and the severity of chunk fragmentation.
-
-3. observation-3
-If it does not carefully place highly referenced chunks and repair them preferentially, deduplication can lead to more corrupted chunks in the presence of UDFs.
-
-
-## 2. Strength (Contributions of the paper)
-1. This paper studies the redundancy characteristics of the file system snapshots from two aspects:
-> the reference counts of chunks
-> the redundancy sources of duplicate chunks
-> minimum hash is better to determine similar files
-> losing a chunk may not necessarily imply the corruptions of many files
-
-2. propose a trace-drvien, deduplication-aware simulation framework to analyze and compare storage system reliability with and without deduplication.
-
-3. apply this simulation framework and get some key findings of its reliability analysis.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-- This paper mentions
-Highly referenced chunks occupy a large fraction of logical capacity, but only a small fraction of physical capacity after deduplication.
-> skew distribution.
-> can assign a small dedicated physical area (with only 1% of physical capacity) for the most referenced chunks and first reparis the physical area to improve them. (incurring only limited storage overhead)
-
-- **Main sources** of duplicate chunks
-> 1. intra-file redundancy
-> 2. duplicate files
+---
+typora-copy-images-to: paper_figure
+---
+A Simulation Analysis of Redundancy and Reliability in Primary Storage Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| TC'18 | Deduplication Reliability |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Despite the wide adoption of deduplication, how deduplication affects storage system reliability remains debatable when compared with storage without deduplication.
+> mitigate the possibility of data loss by reducing storage footprints.
+> amplifies the severity of each data loss event, which may corrupt multiple chunks or files that share the same lost data.
+
+- State-of-the-art countermeasures:
+> add redundancy via replication or erasure coding to post-deduplication data.
+> propose quantitative methods to evaluate deduplication storage reliability.
+
+- two key open reliability issues:
+> loss variations: different failures (device failures, latent sector errors), different granularities of storage (chunk, files)
+> repair strategies: repair strategies determine whether important data copies are repaired first (affecting reliability in different ways)
+
+- Main motivation
+The importance of analyzing and comparing storage system reliability with and without deduplication.
+
+### Method Name
+- Choose dataset
+1. FSL:
+pick nine random snapshots with raw size at least 100GB each
+> Mac OS X server:
+> user011 - user026 (eight users): taken from different users' home directories with various types of files.
+> 
+
+2. MS:
+collected at Microsoft and published on SNIA. Focus on a total of 903 file system snapshots that are collected in a single week
+> time: September 18, 2009
+> the average chunk size: 8KB
+
+Also, consider the notion of a *deduplication domain* (a set of file system snapshots over which it performs deduplication).
+> deduplication domain size specifies the number of file system snapshots included in a deduplication domain
+> generate 10 random deduplication domains for each deduplication domain size.
+
+
+- Redundancy analysis
+1. Reference counts
+Intuition: the importance of a chunk is **proportional to its reference count**.
+> the majority of chunks have small reference counts (e.g., referenced exactly once)
+> losing the highly referenced chunks may lead to severe loss of information as well as high deviations in the reliability simulations.
+
+2. How to determine similar files? (a small sketch follows after this list)
+> 1. share the same minimum chunk fingerprint (minhash, Broder's theorem)
+> 2. share the same maximum chunk fingerprint
+> 3. have the same extension (provides an additional indicator of whether the two files are similar)
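+
+A small sketch of that similarity test (how the three indicators are combined is my assumption; here they are simply OR-ed):
+
+```python
+def probably_similar(recipe_a, recipe_b):
+    """Each recipe is (filename, [chunk fingerprints]).
+    By Broder's theorem, two sets share the same minimum fingerprint with
+    probability equal to their Jaccard similarity, so a shared minhash is
+    a cheap indicator of similar content."""
+    (name_a, fps_a), (name_b, fps_b) = recipe_a, recipe_b
+    same_min = min(fps_a) == min(fps_b)
+    same_max = max(fps_a) == max(fps_b)
+    same_ext = name_a.rsplit(".", 1)[-1] == name_b.rsplit(".", 1)[-1]
+    return same_min or same_max or same_ext
+```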
+
+FSL shows significant fractions of intra-file redundancy.
+The most common redundancy source is duplicate files
+> whole-file deduplication is effective.
+
+- Simulation framework
+Design a framework which analyzes and compares storage system reliability with and without deduplication. Need to configure those factors:
+> 1. failure patterns
+> 2. metadata
+> 3. data layout
+
+
+
+
+- Deliberate copy technique
+Based on the key observation:
+> Highly referenced chunks only account for a small fraction of physical capacity after deduplication
+> the chunk reference counts show a long-tailed distribution
+> It is possible to allocate a small dedicated physical area for storing extra copies of highly referenced chunks
+
+Solution:
+1. allocate the first 1% of physical sectors for the highly referenced chunks
+2. sort the chunks by their reference counts, and fill the dedicated sectors with the top 1% most highly referenced chunks.
+> do this process offline (no need to change the write/read path)
+> incurs only moderate storage overhead
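+
+A sketch of the selection step (my own illustration of the idea, not the paper's code): rank chunks by reference count and copy the most referenced ones into the reserved area until the 1% budget is used up.
+
+```python
+def deliberate_copies(chunks, reserved_fraction=0.01):
+    """chunks is a list of (fingerprint, size_bytes, ref_count).
+    Returns the fingerprints whose extra copies fill the dedicated area,
+    sized as reserved_fraction of the total physical capacity."""
+    budget = reserved_fraction * sum(size for _, size, _ in chunks)
+    selected, used = [], 0
+    for fp, size, refs in sorted(chunks, key=lambda c: c[2], reverse=True):
+        if used + size > budget:
+            break
+        selected.append(fp)
+        used += size
+    return selected
+```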
+
+
+### Implementation and Evaluation
+- Evaluation
+1. observation-1
+Deduplication will significantly alter the expected amount of chunks corrupted by USEs when compared to storage without deduplication.
+
+2. observation-2
+The logical repair progress is affected by the placement of highly referenced chunks and the severity of chunk fragmentation.
+
+3. observation-3
+If it does not carefully place highly referenced chunks and repair them preferentially, deduplication can lead to more corrupted chunks in the presence of UDFs.
+
+
+## 2. Strength (Contributions of the paper)
+1. This paper studies the redundancy characteristics of the file system snapshots from two aspects:
+> the reference counts of chunks
+> the redundancy sources of duplicate chunks
+> minimum hash is better to determine similar files
+> losing a chunk may not necessarily imply the corruptions of many files
+
+2. propose a trace-driven, deduplication-aware simulation framework to analyze and compare storage system reliability with and without deduplication.
+
+3. apply this simulation framework and get some key findings of its reliability analysis.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+- This paper mentions
+Highly referenced chunks occupy a large fraction of logical capacity, but only a small fraction of physical capacity after deduplication.
+> skew distribution.
+> can assign a small dedicated physical area (with only 1% of physical capacity) for the most referenced chunks and repair that physical area first to improve reliability (incurring only limited storage overhead)
+
+- **Main sources** of duplicate chunks
+> 1. intra-file redundancy
+> 2. duplicate files
> 3. similar files
\ No newline at end of file
diff --git a/StoragePaperNote/DeltaCompression-FAST'12.md b/StoragePaperNote/DeltaCompression-FAST'12.md
old mode 100644
new mode 100755
index 29fef07..5244cd2
--- a/StoragePaperNote/DeltaCompression-FAST'12.md
+++ b/StoragePaperNote/DeltaCompression-FAST'12.md
@@ -1,34 +1,34 @@
----
-typora-copy-images-to: ../paper_figure
----
-WAN Optimized Replication of Backup Datasets Using Stream-Informed Delta Compression
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'12 | Delta compression |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-Improved compression is needed to make WAN replication practical.
-> deduplication + delta compression
-
-It proposes a new method which not only eliminates duplicate regions of files (deduplication) but also compresses *similar* regions of files with **delta compression**.
-
-
-
-
-### Method Name
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. New architecture:
-adds *stream-informed* delta compression to already existing deduplication systems and eliminates the need for new, persistent indexes.
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-
+---
+typora-copy-images-to: ../paper_figure
+---
+WAN Optimized Replication of Backup Datasets Using Stream-Informed Delta Compression
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'12 | Delta compression |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+Improved compression is needed to make WAN replication practical.
+> deduplication + delta compression
+
+It proposes a new method which not only eliminates duplicate regions of files (deduplication) but also compresses *similar* regions of files with **delta compression**.
+
+
+
+
+### Method Name
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. New architecture:
+adds *stream-informed* delta compression to already existing deduplication systems and eliminates the need for new, persistent indexes.
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
diff --git a/StoragePaperNote/EnclaveDB-S&P'18.md b/StoragePaperNote/EnclaveDB-S&P'18.md
old mode 100644
new mode 100755
index 07ce31b..8848fd4
--- a/StoragePaperNote/EnclaveDB-S&P'18.md
+++ b/StoragePaperNote/EnclaveDB-S&P'18.md
@@ -1,32 +1,32 @@
----
-typora-copy-images-to: ../paper_figure
----
-EnclaveDB: A Secure Database using SGX
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| S&P'18 | SGX |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-1. Semantically secure encryption can provide strong and efficient protection for data at rest and in transit
-> but this is not sufficient since data processing systems decrypt sensitive data in memory during query processing
-
-2. Other systems use property-preserving encryption to allow query processing on encrypted data.
-> but suffers from limited querying capabilities and is prone to information leakage.
-
-
-
-
-### Method Name
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-
+---
+typora-copy-images-to: ../paper_figure
+---
+EnclaveDB: A Secure Database using SGX
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| S&P'18 | SGX |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+1. Semantically secure encryption can provide strong and efficient protection for data at rest and in transit
+> but this is not sufficient since data processing systems decrypt sensitive data in memory during query processing
+
+2. Other systems use property-preserving encryption to allow query processing on encrypted data.
+> but these suffer from limited querying capabilities and are prone to information leakage.
+
+
+
+
+### Method Name
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
diff --git a/StoragePaperNote/ErasureCoding/Background/Availability-OSDI'10.md b/StoragePaperNote/ErasureCoding/Background/Availability-OSDI'10.md
old mode 100644
new mode 100755
index c99894c..d304413
--- a/StoragePaperNote/ErasureCoding/Background/Availability-OSDI'10.md
+++ b/StoragePaperNote/ErasureCoding/Background/Availability-OSDI'10.md
@@ -1,46 +1,46 @@
----
-typora-copy-images-to: paper_figure
----
-# Availability in Globally Distributed Storage System
-@OSDI'10 @Background
-[TOC]
-
-## Summary
-
-***Availability***
-**Part 1**
-
-The vast majority of such unavailability events are transient and do not result in permanent data loss. Less than 10% of events last longer than 15 minutes.
-
-- Two metrics throughout this paper:
-
-1. **The average availability** of all $N$ nodes in a cell
-$$
-A_N = \frac{\sum_{N_i \in N}uptime(N_i)}{\sum_{N_i \in N}(uptime(N_i)+downtime(N_i))}
-$$
-$uptime(N_i)$ and $downtime(N_i)$ refer to the lengths of time a node $N_i$ is available or unavailable.
-
-2. **Mean time to failure (MTTF)**:
-the measurements of availability
-$$
-MTTF = \frac{uptime}{number \quad failures}
-$$
-
-**Failure Bursts**
-This paper presents a simple time-window-based method to group failure events into failure bursts which, despite its simplicity, successfully identifies bursts with a common cause.
-
-**Key findings from the fleetwide data**
-1. Most unavailability periods are transient
-2. Unavailability durations differ significantly by cause
-> Median duration between 1 and 30 minutes depending on the cause
-
-3. Correlated failures matter
-> 37% of all failures of at least 15 minutes in duration in Google datacenters are part of a correlated failure burst.
-
-4. Largest correlated failures share common failure domains
-> Large failure bursts almost always have significant **rack-corelation**.
-
-## Feedback and Recommendations
-1. Determining the acceptable rate of successful transfers to battery power for individual machines upon a power outage.
-2. Focusing on reducing reboot times, because planned kernel upgrades are major source of related failures.
-3. Moving towards a dynamic delay before initiating recoveries, based on failure classification and recent history of failures on the cell.
+---
+typora-copy-images-to: paper_figure
+---
+# Availability in Globally Distributed Storage System
+@OSDI'10 @Background
+[TOC]
+
+## Summary
+
+***Availability***
+**Part 1**
+
+The vast majority of such unavailability events are transient and do not result in permanent data loss. Less than 10% of events last longer than 15 minutes.
+
+- Two metrics throughout this paper:
+
+1. **The average availability** of all $N$ nodes in a cell
+$$
+A_N = \frac{\sum_{N_i \in N}uptime(N_i)}{\sum_{N_i \in N}(uptime(N_i)+downtime(N_i))}
+$$
+$uptime(N_i)$ and $downtime(N_i)$ refer to the lengths of time a node $N_i$ is available or unavailable.
+
+2. **Mean time to failure (MTTF)**:
+the measurements of availability
+$$
+MTTF = \frac{\text{uptime}}{\text{number of failures}}
+$$
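+
+A small sketch of how the two metrics combine per cell (made-up numbers, just to make the definitions concrete):
+
+```python
+def cell_metrics(nodes):
+    """nodes maps node id -> (uptime_hours, downtime_hours, num_failures)."""
+    up = sum(u for u, _, _ in nodes.values())
+    down = sum(d for _, d, _ in nodes.values())
+    failures = sum(f for _, _, f in nodes.values())
+    availability = up / (up + down)                     # A_N
+    mttf = up / failures if failures else float("inf")  # MTTF
+    return availability, mttf
+
+# Three nodes observed over one month (720 hours each):
+print(cell_metrics({
+    "n1": (719.0, 1.0, 2),
+    "n2": (715.5, 4.5, 1),
+    "n3": (720.0, 0.0, 0),
+}))  # -> (~0.99745, ~718.2 hours)
+```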
+
+**Failure Bursts**
+This paper presents a simple time-window-based method to group failure events into failure bursts which, despite its simplicity, successfully identifies bursts with a common cause.
+
+**Key findings from the fleetwide data**
+1. Most unavailability periods are transient
+2. Unavailability durations differ significantly by cause
+> Median duration between 1 and 30 minutes depending on the cause
+
+3. Correlated failures matter
+> 37% of all failures of at least 15 minutes in duration in Google datacenters are part of a correlated failure burst.
+
+4. Largest correlated failures share common failure domains
+> Large failure bursts almost always have significant **rack-correlation**.
+
+## Feedback and Recommendations
+1. Determining the acceptable rate of successful transfers to battery power for individual machines upon a power outage.
+2. Focusing on reducing reboot times, because planned kernel upgrades are a major source of related failures.
+3. Moving towards a dynamic delay before initiating recoveries, based on failure classification and recent history of failures on the cell.
diff --git a/StoragePaperNote/ErasureCoding/Background/Erasure-Code-Summary.md b/StoragePaperNote/ErasureCoding/Background/Erasure-Code-Summary.md
old mode 100644
new mode 100755
index 323d66e..ea07e97
--- a/StoragePaperNote/ErasureCoding/Background/Erasure-Code-Summary.md
+++ b/StoragePaperNote/ErasureCoding/Background/Erasure-Code-Summary.md
@@ -1,55 +1,55 @@
----
-typora-copy-images-to: paper_figure
----
-# Erasure Coding for Cloud Storage Systems: A Survey
-@Survey @2013
-[TOC]
-
-## Summary
-- The key question in cloud storage is: how do it store data reliably with a high efficiency in terms of both storage overhead and data integrity?
-- This paper presents coding techniques into two categories:
->1. Regenerating codes: optimizing bandwidth consumption.
->2. Locally repairable codes: optimizing I/O overhead
-
-- Regenerating codes saves bandwidth by not transferring data that are unnecessary to the particulat newcomer.
-> Most instances of regenerating codes need to ask providers to send linear combinations of their data to the newcomer. It only saves the bandwidth but not disk I/O. Compared with conventional erasure coding, disk I/O will even probably be increased with regenerating codes.
-
-### Erasure Coding and Its Performance Metrics
-1. For $(n,k)$ Maximum Distance Separable (MDS) code, its storage efficiency is at best $\frac{k}{n}$
-2. **Repair Bandwidth**: if encoding operations can be performed both on the newcomer and providers rather than on the newcomer only, **a much smaller** amount of data can be transferred.
-3. **Repair I/O**: writing operations are performed only at the newcomer, and the amount of data written should equal the size of the coded block (**unaviodable**). So people really care is actually the amount of data read from disks of providers. Two ways to save disk I/O:
-> a). obtian specific coded segments from providers, and hence other coded segments in the same provider will not be encoded and will not be read.
-> b). access specific storage nodes as providers, rather than accessing any $k$ storage nodes.
-
-4. **Access Latency**: systematic codes which the original data can be embedded into code blocks, are able to maintian a higher storage efficiency than replcias while achieving a low access latency.
-5. **Storage Efficiency**: storage efficiency denotes the ration of the amount of the original data to the actual amount of data stored on disks. (MDS achieves the best storage efficiency)
-
-### Tradeoff Between Storage $\alpha$ and Bandwidth $r$: Regenerating Codes
-Two special cases of regenerating codes, which correspond to the minimum storage space required at storage nodes and the minimum total bandwidth consumption in the repair, respectively.
-1. **Minimum-Storage Regenerating (MSR) Codes**: $(\alpha_{MSR}, r_{MSR})=(\frac{M}{k},\frac{Md}{k(d-k+1)})$
-2. **Minimum-Bandwidth Regenerating (MBR) Codes**: $(\alpha_{MBR}, r_{MBR})=(\frac{2Md}{k(2d-k+1)},\frac{2Md}{k(2d-k+1)})$
-3. To implement regenerating codes, the simplest way is to use **random linear coding** (significant computational complexity, not ensure that any $k$ code blocks are decodable). Thus, it is necessary to find explicit construction of regenerating codes.
-4. **Repair-by-transfer regenerating codes**: with the repair-by-transfer property, the disk I/O overhead can be minimal since only data needed by the newcomer weill be read from the providers.
-
-### Saving the Disk I/O Overhead: Locally Repairable Codes
-Three representative families of the LRC:
->1. Hierarchical codes
->2. Self-repairing codes
->3. Simple regenerating codes
-
-1. **Hierarchical Codes**: the repair degrees of coded blocks vary from 2 to $k$. An instance of large hierarchical codes can be constructed step by step from instances of smaller hierarchical codes.
-2. **Self-repairing codes**: self-repairing codes can achieve a constant repair degree, independent of any specific missing block.
-3. **Simple regenerating codes**: Though both hierarchical codes and self-repairing codes can achieve a low repair degree, their resiliences to the failures of storage nodes are **probabilistic**. The tolerance against failures and storage overhead of simple regenerating codes becomes predictable.
-4. **Tradeoff between the failures tolerance and the repair degree**: It has been shown that none of the locally repairable codes can preserve the MDS property. There is the tradeoff between repair degree and the minimum distance.
-
-### Trend in the research
-1. the design objective gradually transfers from **data integrity** to **resource overhead**, from the **bandwidth resource** to some other scarcer resource for the cloud storage system, such as **computation** and **disk I/O overhead**.
-2. The tradeoff between the repair degree and storage overhead has not been established clearly.
-3. Given that the cloud storage system scales globally in multiple data centers, bandwidth, computation, and the corresponding geographical heterogeneities should be carefully discussed.
-
-
-
-
-
-
-
+---
+typora-copy-images-to: paper_figure
+---
+# Erasure Coding for Cloud Storage Systems: A Survey
+@Survey @2013
+[TOC]
+
+## Summary
+- The key question in cloud storage is: how to store data reliably with high efficiency in terms of both storage overhead and data integrity?
+- This paper groups coding techniques into two categories:
+>1. Regenerating codes: optimizing bandwidth consumption.
+>2. Locally repairable codes: optimizing I/O overhead
+
+- Regenerating codes save bandwidth by not transferring data that are unnecessary to the particular newcomer.
+> Most instances of regenerating codes need to ask providers to send linear combinations of their data to the newcomer. It only saves the bandwidth but not disk I/O. Compared with conventional erasure coding, disk I/O will even probably be increased with regenerating codes.
+
+### Erasure Coding and Its Performance Metrics
+1. For $(n,k)$ Maximum Distance Separable (MDS) code, its storage efficiency is at best $\frac{k}{n}$
+2. **Repair Bandwidth**: if encoding operations can be performed both on the newcomer and providers rather than on the newcomer only, **a much smaller** amount of data can be transferred.
+3. **Repair I/O**: writing operations are performed only at the newcomer, and the amount of data written should equal the size of the coded block (**unavoidable**). So what people really care about is the amount of data read from the disks of providers. Two ways to save disk I/O:
+> a). obtain specific coded segments from providers, and hence other coded segments in the same provider will not be encoded and will not be read.
+> b). access specific storage nodes as providers, rather than accessing any $k$ storage nodes.
+
+4. **Access Latency**: systematic codes, in which the original data are embedded into the code blocks, are able to maintain a higher storage efficiency than replicas while achieving a low access latency.
+5. **Storage Efficiency**: storage efficiency denotes the ratio of the amount of original data to the actual amount of data stored on disks. (MDS codes achieve the best storage efficiency)
+
+### Tradeoff Between Storage $\alpha$ and Bandwidth $r$: Regenerating Codes
+Two special cases of regenerating codes, which correspond to the minimum storage space required at storage nodes and the minimum total bandwidth consumption in the repair, respectively.
+1. **Minimum-Storage Regenerating (MSR) Codes**: $(\alpha_{MSR}, r_{MSR})=(\frac{M}{k},\frac{Md}{k(d-k+1)})$
+2. **Minimum-Bandwidth Regenerating (MBR) Codes**: $(\alpha_{MBR}, r_{MBR})=(\frac{2Md}{k(2d-k+1)},\frac{2Md}{k(2d-k+1)})$ (a numeric sketch follows after this list)
+3. To implement regenerating codes, the simplest way is to use **random linear coding** (significant computational complexity, and it does not ensure that any $k$ code blocks are decodable). Thus, it is necessary to find explicit constructions of regenerating codes.
+4. **Repair-by-transfer regenerating codes**: with the repair-by-transfer property, the disk I/O overhead can be minimal since only data needed by the newcomer will be read from the providers.
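+
+Plugging concrete numbers into the MSR/MBR points above (the parameters below are my own example, not from the survey):
+
+```python
+def msr_mbr_points(M, k, d):
+    """Per-node storage alpha and repair bandwidth r at the two extreme
+    points of the storage/bandwidth tradeoff, for file size M, reconstruction
+    degree k, and d helper nodes contacted during a repair (k <= d <= n-1)."""
+    msr = (M / k, M * d / (k * (d - k + 1)))
+    mbr_value = 2 * M * d / (k * (2 * d - k + 1))
+    return {"MSR": msr, "MBR": (mbr_value, mbr_value)}
+
+# Example: a 1024 MB object with k = 10, repaired from d = 13 helpers.
+print(msr_mbr_points(M=1024, k=10, d=13))
+# MSR: ~102.4 MB stored per node, ~332.8 MB repair bandwidth.
+# MBR: ~156.6 MB for both, i.e. more storage per node but a cheaper repair.
+```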
+
+### Saving the Disk I/O Overhead: Locally Repairable Codes
+Three representative families of the LRC:
+>1. Hierarchical codes
+>2. Self-repairing codes
+>3. Simple regenerating codes
+
+1. **Hierarchical Codes**: the repair degrees of coded blocks vary from 2 to $k$. An instance of large hierarchical codes can be constructed step by step from instances of smaller hierarchical codes.
+2. **Self-repairing codes**: self-repairing codes can achieve a constant repair degree, independent of any specific missing block.
+3. **Simple regenerating codes**: Though both hierarchical codes and self-repairing codes can achieve a low repair degree, their resilience to the failures of storage nodes is **probabilistic**. In contrast, the failure tolerance and storage overhead of simple regenerating codes are predictable.
+4. **Tradeoff between the failures tolerance and the repair degree**: It has been shown that none of the locally repairable codes can preserve the MDS property. There is the tradeoff between repair degree and the minimum distance.
+
+### Trend in the research
+1. the design objective gradually shifts from **data integrity** to **resource overhead**, and from the **bandwidth resource** to other, scarcer resources for the cloud storage system, such as **computation** and **disk I/O overhead**.
+2. The tradeoff between the repair degree and storage overhead has not been established clearly.
+3. Given that the cloud storage system scales globally in multiple data centers, bandwidth, computation, and the corresponding geographical heterogeneities should be carefully discussed.
+
+
+
+
+
+
+
diff --git a/StoragePaperNote/ErasureCoding/Background/Facebook Warehouse Cluster-HotStorage'13.md b/StoragePaperNote/ErasureCoding/Background/Facebook Warehouse Cluster-HotStorage'13.md
old mode 100644
new mode 100755
index d4228a7..9139ed8
--- a/StoragePaperNote/ErasureCoding/Background/Facebook Warehouse Cluster-HotStorage'13.md
+++ b/StoragePaperNote/ErasureCoding/Background/Facebook Warehouse Cluster-HotStorage'13.md
@@ -1,21 +1,21 @@
-# A Solution to the Network Challenges of Data Recovery in Erasure-coded Distributed Storage Systems: A study on the Facebook Warehouse Cluster
-@HotStorage'13 @Study on Data-center Network @Note
-[TOC]
-
-## Summary
-***Background from Facebook's warehouse cluster***:
-- The warehouse cluster at Facebook employs an RS code with parameters ($k=10, r=4$), thus resulting in a $1.4 \times$ storage requirement, as compared to $3 \times$ under conventional replication, for a similar level fo reliability.
-- The most frequently accessed data is stored as 3 replicas, to allow for efficient scheduling of the map-reduce jobs.
-- For the data which has not been accessed for more than three months is stored as a $(10,4)$ RS code.
-
-***Data Recovery***
-- In Facebook warehouse cluster, the median is more than 50 machines-unavailability events per day.
-> This reasserts the necessity of redundancy in the data for reliability and availability.
-
-- Number of missing blocks in a stripe:
-> one block missing: 98.08% (most common)
-> two blocks missing: 1.87%
-> three or more blocks missing: 0.05%
-
-- Cross-rack bandwidth consumed:
+# A Solution to the Network Challenges of Data Recovery in Erasure-coded Distributed Storage Systems: A study on the Facebook Warehouse Cluster
+@HotStorage'13 @Study on Data-center Network @Note
+[TOC]
+
+## Summary
+***Background from Facebook's warehouse cluster***:
+- The warehouse cluster at Facebook employs an RS code with parameters ($k=10, r=4$), thus resulting in a $1.4 \times$ storage requirement, as compared to $3 \times$ under conventional replication, for a similar level of reliability.
+- The most frequently accessed data is stored as 3 replicas, to allow for efficient scheduling of the map-reduce jobs.
+- Data that has not been accessed for more than three months is stored as a $(10,4)$ RS code.
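+
+As a quick sanity check (simple arithmetic on the stated parameters, not a figure from the paper), the $1.4\times$ overhead follows directly from the code parameters:
+$$
+\frac{k+r}{k} = \frac{10+4}{10} = 1.4\times \quad \text{vs.} \quad 3\times \text{ for 3-way replication}
+$$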
+
+***Data Recovery***
+- In the Facebook warehouse cluster, the median is more than 50 machine-unavailability events per day.
+> This reasserts the necessity of redundancy in the data for reliability and availability.
+
+- Number of missing blocks in a stripe:
+> one block missing: 98.08% (most common)
+> two blocks missing: 1.87%
+> three or more blocks missing: 0.05%
+
+- Cross-rack bandwidth consumed:
> A median of more than 180TB of data is transferred through the TOR switches every day for RS-coded data recovery.
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Background/Raft-ATC'14.md b/StoragePaperNote/ErasureCoding/Background/Raft-ATC'14.md
old mode 100644
new mode 100755
index 0c95d44..fba1e59
--- a/StoragePaperNote/ErasureCoding/Background/Raft-ATC'14.md
+++ b/StoragePaperNote/ErasureCoding/Background/Raft-ATC'14.md
@@ -1,90 +1,90 @@
-typora-copy-images-to: paper_figure
-
-In Search of an Understandable Consensus Algorithm
-------------------------------------------
-| Venue | Category |
-| :----: | :-----------------: |
-| ATC'14 | Consensus Algorithm |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Paxos is quite different ro understand. Its architecture requires complex changes to support practical system.
-> Both system builders and students struggle with Paxos.
-
-- Two drawbacks in Paxos
-1. Paxos is exceptionally difficult to understand.
-> why single-decree protocol works?
-
-2. Paxos does not provide a good foundation for building practical implementations.
-
-Paxos does not provide a good foundation either for system building or for education. This paper could provide a better foundation for system building and education. The primary goal is understandability.
-
-### Raft
-- Three key novel features
->1. strong leader: log entries only flow from the leader to other servers.
->> simplify the management of the replicated log
->2. leader election: Raft uses randomized timer to elect leaders
->> only add a small amount of mechanism to the heartbeats
->
->3. membership changes: a new joint consensus approach, the majorities of two different configurations overlap during transitions.
-
-- Replicated State Machine
-It can be used to solve a varity of fault tolerance problems in distributed systems. Replicated state machines are typically inplemented using **a replicated log**.
-> Each server stores a log containing a series of commands
-> The state machines are deterministic
-
-*Goal*: keeping the replicated log **consistent** is the job of the consensus algorithm.
-
-- To provide better understandability:
-1. problem decomposition
->a. leader election: the leader must be chosen when an existing leader fails
->b. log replication
->c. Safety
-
-2. simplify the state space by reducing the number of states to consider.
-
-
-- Leader Election
-1. It uses a heartbeat mechanism to trigger leader election.
-2. If a follower receives no communication over a period of time called the election timeout, then it assumes there is no viable leader and begins an election to choose a new leader.
-3. A candidate wins an election if it receives votes from a majority of the severs in the full cluster for the same term.
-> **Majority**: majority rule ensures that at most one candidate can win the election for a particular term.
-
-
-
-4. To handle split votes?
-> if many followers become candidates at the same time, votes could be split so that no candidate obtains a majority.
-> Solution: use randomized election timeout to ensure that split votes are rare and that they are resolved quickly.
-
-- Log Replication
-1. The leader appends the command to its log as a new entry, then issues **AppendEntries** RPCs in parallel to each of the other servers to replicate the entry.
-> If followers crash or run slowly, or if network packets are lost, the leader retries **AppendEntries** RPCs **indefinitely** (even after the the leader has responded to the client) until all followers eventually store all log entries.
-
-2. The leader decides when it is safe to apply a log entry to the state machines (commited)
-> A log entry has replicated it on a majority of the servers.
-> the leader must find the latest log entry where the two logs agree, delete any entries in the follower's log after that point, and send the follower all of the leader's entries.
-
-- Safety
-1. Adding a restriction on which servers may be elected leader
-> ensure that the leader for any given term contains all of the entries committed in previous terms.
-
-2. Raft uses a simpler approach where it guarantees that all the committed entries from previous terms are present on each new leader from the moment of its election.
-> log entries only flow in one direction, from leaders to followers, and leaders never overwrite existing entries in their log.
-
-- Cluster Membership changes
-Challenge:
-> Not possible to do an atomic switch (change the membership of all servers at one)
-
-1. use a two-phase approach:
-> switch first to a transitional **joint consensus** configuration
-> once the joint consensus has been committed, transition to the new configuration
-
-### Implementation and Evaluation
-
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+typora-copy-images-to: paper_figure
+
+In Search of an Understandable Consensus Algorithm
+------------------------------------------
+| Venue | Category |
+| :----: | :-----------------: |
+| ATC'14 | Consensus Algorithm |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Paxos is quite difficult to understand. Its architecture requires complex changes to support practical systems.
+> Both system builders and students struggle with Paxos.
+
+- Two drawbacks in Paxos
+1. Paxos is exceptionally difficult to understand.
+> Why does the single-decree protocol work?
+
+2. Paxos does not provide a good foundation for building practical implementations.
+
+Paxos does not provide a good foundation either for system building or for education. This paper could provide a better foundation for system building and education. The primary goal is understandability.
+
+### Raft
+- Three key novel features
+>1. strong leader: log entries only flow from the leader to other servers.
+>> simplify the management of the replicated log
+>2. leader election: Raft uses randomized timers to elect leaders
+>> only adds a small amount of mechanism to the heartbeats
+>
+>3. membership changes: a new joint consensus approach, the majorities of two different configurations overlap during transitions.
+
+- Replicated State Machine
+It can be used to solve a variety of fault-tolerance problems in distributed systems. Replicated state machines are typically implemented using **a replicated log**.
+> Each server stores a log containing a series of commands
+> The state machines are deterministic
+
+*Goal*: keeping the replicated log **consistent** is the job of the consensus algorithm.
+
+- To provide better understandability:
+1. problem decomposition
+>a. leader election: the leader must be chosen when an existing leader fails
+>b. log replication
+>c. Safety
+
+2. simplify the state space by reducing the number of states to consider.
+
+
+- Leader Election
+1. It uses a heartbeat mechanism to trigger leader election.
+2. If a follower receives no communication over a period of time called the election timeout, then it assumes there is no viable leader and begins an election to choose a new leader.
+3. A candidate wins an election if it receives votes from a majority of the servers in the full cluster for the same term.
+> **Majority**: majority rule ensures that at most one candidate can win the election for a particular term.
+
+
+
+4. To handle split votes?
+> if many followers become candidates at the same time, votes could be split so that no candidate obtains a majority.
+> Solution: use randomized election timeouts to ensure that split votes are rare and that they are resolved quickly.
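+
+A minimal Python sketch of the randomized-timeout idea follows; the interval values are illustrative (Raft only requires timeouts to be randomized and much larger than the broadcast time), and the function names are not from any Raft implementation.
+
+```python
+import random
+import time
+
+ELECTION_TIMEOUT_RANGE = (0.150, 0.300)  # seconds; illustrative values
+
+def new_election_timeout():
+    """Each follower draws its own timeout, so followers rarely time out
+    (and start competing elections) at exactly the same moment."""
+    return random.uniform(*ELECTION_TIMEOUT_RANGE)
+
+def should_become_candidate(last_heartbeat_at, timeout):
+    """True if no heartbeat/AppendEntries arrived within the timeout."""
+    return (time.monotonic() - last_heartbeat_at) > timeout
+```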
+
+- Log Replication
+1. The leader appends the command to its log as a new entry, then issues **AppendEntries** RPCs in parallel to each of the other servers to replicate the entry.
+> If followers crash or run slowly, or if network packets are lost, the leader retries **AppendEntries** RPCs **indefinitely** (even after the leader has responded to the client) until all followers eventually store all log entries.
+
+2. The leader decides when it is safe to apply a log entry to the state machines (committed)
+> A log entry is committed once the leader has replicated it on a majority of the servers.
+> The leader must find the latest log entry where the two logs agree, delete any entries in the follower's log after that point, and send the follower all of the leader's entries after that point.
+
+- Safety
+1. Adding a restriction on which servers may be elected leader
+> ensure that the leader for any given term contains all of the entries committed in previous terms.
+
+2. Raft uses a simpler approach where it guarantees that all the committed entries from previous terms are present on each new leader from the moment of its election.
+> log entries only flow in one direction, from leaders to followers, and leaders never overwrite existing entries in their log.
+
+- Cluster Membership changes
+Challenge:
+> Not possible to do an atomic switch (change the membership of all servers at once)
+
+1. use a two-phase approach:
+> switch first to a transitional **joint consensus** configuration
+> once the joint consensus has been committed, transition to the new configuration
+
+### Implementation and Evaluation
+
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/ErasureCoding/Comparison/LRC-Comparison-ATC'18.md b/StoragePaperNote/ErasureCoding/Comparison/LRC-Comparison-ATC'18.md
old mode 100644
new mode 100755
index dfea1de..08f1888
--- a/StoragePaperNote/ErasureCoding/Comparison/LRC-Comparison-ATC'18.md
+++ b/StoragePaperNote/ErasureCoding/Comparison/LRC-Comparison-ATC'18.md
@@ -1,57 +1,57 @@
----
-typora-copy-images-to: paper_figure
----
-# On Fault Tolerance, Locality, and Optimality in Locally Repairable Codes
-@ATC'18 @LRC
-[TOC]
-
-## Summary
-***Motivation of this paper***: Existing theoretical models cannot be used to directly compare different LRCs to determine which code will offer the best recovery performance, and at what cost. Thus, this paper performs the first systematic comparison of existing LRC approaches.
-The goal of this paper: Lay mathematical basis for comparison
->1. Define parameters for comparison
->2. Compare codes in all sets of configurations, extend LRCs
->3. Compare for $9 \leq n \leq 19$ and overhead $\leq 2$
->4. Validate in a real system
-
-***Methodology***:
-- For full-LRCs and data-LRCs $(n, k, r)$
->1. Full-LRCs: all the blocks, including the global parities, can be repaired locally from $r$ surviving blocks.
->2. Data-LRCs: Only the data blocks can be repaired in local fashion by $r$ surviving blocks, while the global parities require $k$ blocks for recovery.
-
-This paper provides a framework for directly comparing all codes in all parameter combinations.
-- the minimal distance $d$ is more appropriate for large-scale analysis, instead of MTTDL
-- Average Repair Cost (ARC)
-$$
-ARC=\frac{\sum^n_{i=1}cost(b_i)}{n}
-$$
-
-- Normalized Repair Cost (NRC), it can be viewed as the average cost of repairing a failed data block, where the cost of repairing the parity blocks is amortized over the $k$ data blocks.
-$$
-NRC=ARC \times {\frac{n}{k}}=\frac{\sum^n_{i=1}cost(b_i)}{k}
-$$
-
-- Degraded Cost: average cost of repairing data blocks only:
-$$
-Degraded Cost=\frac{\sum^k_{i=1}cost(b_i)}{k}
-$$
-This paper compares Azure-LRC, Xorbas, Reed-Solomon in those aspects
-***Implementation and Evaluation***:
-- This paper follows the theoretical analysis with an evaluation of those codes in a **Ceph** cluster deployed in AWS EC2. It uses the **Jeasure Erasure Code plugin** and **Locally Repairable Erasure Code plugin** to implement Azure-LRC and Azure-LRC+1
-- For the generator matrix of Optimal-LRC: it uses the **Matlab** to do the construction.
-
-**Evaluation**:
-1. Amount of data read and transferred
-2. Repair time
-3. Foreground Workloads: RADOS Bench, writes objects for a given amount of time, reads all the objects, and terminates.
-The evaluation result of Ceph shows the benefit of full-LRCs and data-LRCs depends on the underlying storage devices, network topology, and foreground application load.
-
-## Strength (Contributions of the paper)
-1. This paper conducts a theoretical comparison between the different LRC approaches and demonstrates the limitations of existing measures, such as locality and average repair cost.
-2. **Optimal-LRC implementation**: It defines new metrics that model each code's overhead, full-node repair cost, degraded read cost, and fault tolerance.
-3. **Amazon EC2 deployment**: This paper deploys its Ceph cluster on 20 instances in the Amazon EC2.
-## Weakness (Limitations of the paper)
-1. This paper just does the comparison between various LRCs, which is not very novel.
-2. Its comparative framework of using NRC is not very novel.
-## Future Works
-1. I think one point can be extended is to refine its comparative framework. Currently, just give a new definiation in term of *Noramlized Repair Cost* is not enough.
-
+---
+typora-copy-images-to: paper_figure
+---
+# On Fault Tolerance, Locality, and Optimality in Locally Repairable Codes
+@ATC'18 @LRC
+[TOC]
+
+## Summary
+***Motivation of this paper***: Existing theoretical models cannot be used to directly compare different LRCs to determine which code will offer the best recovery performance, and at what cost. Thus, this paper performs the first systematic comparison of existing LRC approaches.
+The goal of this paper: Lay mathematical basis for comparison
+>1. Define parameters for comparison
+>2. Compare codes in all sets of configurations, extend LRCs
+>3. Compare for $9 \leq n \leq 19$ and overhead $\leq 2$
+>4. Validate in a real system
+
+***Methodology***:
+- For full-LRCs and data-LRCs $(n, k, r)$
+>1. Full-LRCs: all the blocks, including the global parities, can be repaired locally from $r$ surviving blocks.
+>2. Data-LRCs: Only the data blocks can be repaired in local fashion by $r$ surviving blocks, while the global parities require $k$ blocks for recovery.
+
+This paper provides a framework for directly comparing all codes in all parameter combinations.
+- the minimal distance $d$ is more appropriate for large-scale analysis, instead of MTTDL
+- Average Repair Cost (ARC)
+$$
+ARC=\frac{\sum^n_{i=1}cost(b_i)}{n}
+$$
+
+- Normalized Repair Cost (NRC), it can be viewed as the average cost of repairing a failed data block, where the cost of repairing the parity blocks is amortized over the $k$ data blocks.
+$$
+NRC=ARC \times {\frac{n}{k}}=\frac{\sum^n_{i=1}cost(b_i)}{k}
+$$
+
+- Degraded Cost: average cost of repairing data blocks only:
+$$
+Degraded Cost=\frac{\sum^k_{i=1}cost(b_i)}{k}
+$$
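+
+A minimal Python sketch of these three metrics, assuming `costs` lists the repair cost of each of the $n$ blocks (e.g., the number of blocks read to repair it) with the first $k$ entries being the data blocks; the example numbers are illustrative, not from the paper.
+
+```python
+def arc(costs):
+    """Average Repair Cost over all n blocks."""
+    return sum(costs) / len(costs)
+
+def nrc(costs, k):
+    """Normalized Repair Cost: ARC * n/k, i.e. parity repairs amortized over k data blocks."""
+    return sum(costs) / k
+
+def degraded_cost(costs, k):
+    """Average cost of repairing data blocks only (costs[:k])."""
+    return sum(costs[:k]) / k
+
+# Toy (n=8, k=4) layout where data blocks repair locally and parities do not.
+costs = [2, 2, 2, 2, 4, 4, 4, 4]
+print(arc(costs), nrc(costs, k=4), degraded_cost(costs, k=4))
+```
+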
+This paper compares Azure-LRC, Xorbas, and Reed-Solomon in these aspects.
+***Implementation and Evaluation***:
+- This paper follows the theoretical analysis with an evaluation of those codes in a **Ceph** cluster deployed on AWS EC2. It uses the **Jerasure Erasure Code plugin** and the **Locally Repairable Erasure Code plugin** to implement Azure-LRC and Azure-LRC+1.
+- For the generator matrix of Optimal-LRC: it uses **Matlab** to do the construction.
+
+**Evaluation**:
+1. Amount of data read and transferred
+2. Repair time
+3. Foreground Workloads: RADOS Bench, which writes objects for a given amount of time, reads all the objects, and terminates.
+The Ceph evaluation results show that the benefit of full-LRCs and data-LRCs depends on the underlying storage devices, network topology, and foreground application load.
+
+## Strength (Contributions of the paper)
+1. This paper conducts a theoretical comparison between the different LRC approaches and demonstrates the limitations of existing measures, such as locality and average repair cost.
+2. **Optimal-LRC implementation**: It defines new metrics that model each code's overhead, full-node repair cost, degraded read cost, and fault tolerance.
+3. **Amazon EC2 deployment**: This paper deploys its Ceph cluster on 20 instances in the Amazon EC2.
+## Weakness (Limitations of the paper)
+1. This paper only does a comparison between various LRCs, which is not very novel.
+2. Its comparative framework based on NRC is not very novel.
+## Future Works
+1. I think one point that can be extended is to refine its comparative framework. Currently, just giving a new definition in terms of *Normalized Repair Cost* is not enough.
+
diff --git a/StoragePaperNote/ErasureCoding/EC Update/CAU-ICPP'18.md b/StoragePaperNote/ErasureCoding/EC Update/CAU-ICPP'18.md
old mode 100644
new mode 100755
index 4e27584..2020c47
--- a/StoragePaperNote/ErasureCoding/EC Update/CAU-ICPP'18.md
+++ b/StoragePaperNote/ErasureCoding/EC Update/CAU-ICPP'18.md
@@ -1,59 +1,59 @@
----
-typora-copy-images-to: paper_figure
----
-# Cross-Rack-Aware Updates in Erasure-Coded Data Centers
-@ICPP'18 @Cross-Rack-Aware @Parity Updates
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-- EC updates are very common in today's DC storage workloads (*update-intensive workloads*)
-> Frequent small-size updates in turn lead to instensive parity updates in erasure-coded DCs
-> **Hierachical topological nature** of DCs makes the cross-rack bandwidth often oversubscribed, and much more scarce thatn the inner-rack bandwidth.
-
-- This paper tries to distribute a stripe ($RS(n, k)$) across $r 1. it leverages the **append-commit procedure**, without immediately updating the associated parity chunk.
-> 2. it appends the new data chunk to an **append-only** log, and executes the update in commit phase. (Note: there is no **redundancy** of this new update chunk $\rightarrow$ **interim replication**)
-> 3. **Interim replication**: currently stores one replica for each newly updates data chunk in a different rack *temporarily*. (until it performs parity updates in the commit phase)
->
-> 
-
-- Change the delta-based update to **selective parity updates**
-> 
->
-> 1. The difference of those two method is where to compute the change of a parity chunk (the place where the parity chunk store, or the place where the data chunk store). (Note: it is not the theorically minimum cross-rack update traffic, because it ignores the case of some special erasure code)
-
-- Data Grouping (based on the **spatial locality in updates**)
-> 1. process each stripe independently (rather than multiple stripes), and only select two racks for data grouping.
-> 2. **swap** (presever fault tolerence) updated data chunks and non-updated data chunks, reallocate the updated data chunks in the same rack, to mitigate the cross-rack update traffic. (Here is an algorithm to find the maximum gain of various swap schedules)
-> 
-
-### Implementation and Evaluation
-- Implementation
-
-
-- Evaluation
-> 1. Compare with baseline and PARIX.
->> Recovery throughput, impact of non-buffered I/O, impact of gateway bandwidth, import of append phase time.
-> 2. To simulating a hierarchical DC, it uses a node to act as gateway. And use TC to limit the gateway bandwidth. so as to minic the over-subscription scenario.
-
-## 2. Strength (Contributions of the paper)
-- This paper presets CAU to mitigate the cross-rack update traffic through two key points
-> 1. selective parity updates
-> 2. data grouping
-
-
-- Leverage the interim replication to maintain reliability, and do reliability analysis to prove it can work.
-
-- Implement the prototype of CAU (**open-source**), and evaluates the CAU under real-world workloads. (including trace-driven analysis, local cluster experiments, Amazon EC2 experiments)
-## 3. Weakness (Limitations of the paper)
-- In its data grouping process, it swapes chunks of different racks, which also needs to updates the metadata of metadata server.
-- In part of reliability analysis, its model is very simple, and does not consider the cases of different, I feel it is not very convincible.
-
-## 4. Future Works
-- 1. it can further leverge some features of specifal erasure code to further improve its performance. For example, it may not require to update all parity blocks in some special erasure code.
-- 2. In its data grouping, it just consider swap the block between two rack, if it extends the case to multiple racks, can it gain further improvement?
+---
+typora-copy-images-to: paper_figure
+---
+# Cross-Rack-Aware Updates in Erasure-Coded Data Centers
+@ICPP'18 @Cross-Rack-Aware @Parity Updates
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+- EC updates are very common in today's DC storage workloads (*update-intensive workloads*)
+> Frequent small-size updates in turn lead to intensive parity updates in erasure-coded DCs
+> The **hierarchical topological nature** of DCs makes the cross-rack bandwidth often oversubscribed, and much scarcer than the inner-rack bandwidth.
+
+- This paper tries to distribute a stripe ($RS(n, k)$) across $r$ racks. Its approach (CAU):
+> 1. It leverages an **append-commit procedure**, without immediately updating the associated parity chunk.
+> 2. It appends the new data chunk to an **append-only** log, and executes the update in the commit phase. (Note: there is no **redundancy** for this newly updated chunk $\rightarrow$ **interim replication**)
+> 3. **Interim replication**: it temporarily stores one replica of each newly updated data chunk in a different rack (until it performs parity updates in the commit phase).
+>
+> 
+
+- Change the delta-based update to **selective parity updates**
+> 
+>
+> 1. The difference between these two methods is where the change of a parity chunk is computed (the rack where the parity chunk is stored, or the rack where the data chunk is stored). (Note: this is not the theoretically minimum cross-rack update traffic, because it ignores some special erasure codes.)
+
+- Data Grouping (based on the **spatial locality in updates**)
+> 1. process each stripe independently (rather than multiple stripes), and only select two racks for data grouping.
+> 2. **swap** (preserving fault tolerance) updated data chunks and non-updated data chunks, re-allocating the updated data chunks into the same rack, to mitigate the cross-rack update traffic. (There is an algorithm to find the maximum gain among the various swap schedules.)
+> 
+
+### Implementation and Evaluation
+- Implementation
+
+
+- Evaluation
+> 1. Compare with baseline and PARIX.
+>> Recovery throughput, impact of non-buffered I/O, impact of gateway bandwidth, impact of append phase time.
+> 2. To simulate a hierarchical DC, it uses a node to act as the gateway, and uses TC to limit the gateway bandwidth so as to mimic the over-subscription scenario.
+
+## 2. Strength (Contributions of the paper)
+- This paper presents CAU to mitigate the cross-rack update traffic through two key techniques:
+> 1. selective parity updates
+> 2. data grouping
+
+
+- Leverage interim replication to maintain reliability, and perform a reliability analysis to show that it works.
+
+- Implement the prototype of CAU (**open-source**), and evaluates the CAU under real-world workloads. (including trace-driven analysis, local cluster experiments, Amazon EC2 experiments)
+## 3. Weakness (Limitations of the paper)
+- In its data grouping process, it swaps chunks across different racks, which also requires updating the metadata on the metadata server.
+- In the reliability analysis, its model is very simple and does not consider different cases, so I feel it is not very convincing.
+
+## 4. Future Works
+- 1. It can further leverage features of special erasure codes to further improve its performance. For example, some special erasure codes may not require updating all parity blocks.
+- 2. In its data grouping, it only considers swapping blocks between two racks; if it extends the case to multiple racks, can it gain further improvement?
diff --git a/StoragePaperNote/ErasureCoding/EC Update/ParIX-ATC'17.md b/StoragePaperNote/ErasureCoding/EC Update/ParIX-ATC'17.md
old mode 100644
new mode 100755
index e21d737..30326e0
--- a/StoragePaperNote/ErasureCoding/EC Update/ParIX-ATC'17.md
+++ b/StoragePaperNote/ErasureCoding/EC Update/ParIX-ATC'17.md
@@ -1,49 +1,49 @@
----
-typora-copy-images-to: paper_figure
----
-PARIX: Speculative Partial Writes in Erasure-Coded Systems
---------------------------------
-
-@ATC'17 @Erasure Coded writes
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-- This paper aims to solve the problem of partial write in EC. The main problem of partial write of EC is the high I/O amplification. Although this can can be mitigate by leveaging parity logging to append incremental change log (update the delta of parity), it agrues the cost of **in-place read-and-write** is still expensive (its latency is equivalent to that of random seek).
-
-- This paper wants to further optimize parity logging by eliminating the read. (i.e., the key to improve the performance of partial writes is to reduce the number of reads)
-
-### PariX
-- The key idea of PariX
-> 1. Each parity records $m$ series of changes logs of $m$ data blocks.
-> > The rationale: $d^{(0)}_i \leftarrow d^{(1)}_i, d^{(2)}_i, d^{(3)}_i, ... ,d^{(r)}_i$,
-> > $p_j^{(r)} = p_j^{(0)} + a_{ij} \times (d_i^{(r)} - d_i^{(0)})$
-> 2. The Speculation:
-> > In this paper, its speculation means whether the parities need $d_i^{(0)}$ or not.
-> > it speculates that
-> > 1) Assume $d_i^{(0)}$ is NOT needed (mostly right)
-> > 2) Send $d_i^{(0)}$ only when it actually needed (sometimes only)
-
-- PariX's overhead is a disk write, while previous parity log schemes's overhead is **a disk write after a disk read**, assuming the read cache is missed.
-
-- The whole workflow of PariX:
-
- 
-
-**Note that**: For the case in replication, it is also (1 loop + 1 RTT)
-
-### Implementation and Evaluation
-- Based on a previous block store of this group, master server is implemented by **MySQL** and **Redis**. (refer to Usra (Open Source partly))
-- Random I/O lantency
-- Recovery time
-
-## 2. Strength (Contributions of the paper)
-- the idea of this paper is not very complex, but can really work. Reducing the time of read is an intitutive method to impove the performance of writing.
-- the method can truly work in the scenario of
-## 3. Weakness (Limitations of the paper)
-- When there are too many one-shot write in a workload, PariX cannot work well. Because it cannot fit its speculation.
-- When PariX speculation fails, it needs to send $d_i^{(0)}$ to all parities, this penatly would introduce high network traffic overhead. (Although it argues that parities writes are small)
-- In its evaluation, it just test this prototype in a small testbed (with 10 machines), with EC (4, 2). This setting cannot prove the scalability of it.
-
-## 4. Future Works
-- In this method, its speculation is very simple, which cannot fit the one-shot write well. How to handle the large sequential one-shot write workload? Maybe it is a point can be extended.
+---
+typora-copy-images-to: paper_figure
+---
+PARIX: Speculative Partial Writes in Erasure-Coded Systems
+--------------------------------
+
+@ATC'17 @Erasure Coded writes
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+- This paper aims to solve the problem of partial writes in EC. The main problem of partial writes in EC is the high I/O amplification. Although this can be mitigated by leveraging parity logging to append an incremental change log (updating the parity delta), the paper argues that the cost of **in-place read-and-write** is still expensive (its latency is equivalent to that of a random seek).
+
+- This paper wants to further optimize parity logging by eliminating the read. (i.e., the key to improve the performance of partial writes is to reduce the number of reads)
+
+### PariX
+- The key idea of PariX
+> 1. Each parity records $m$ series of change logs for the $m$ data blocks.
+> > The rationale: $d^{(0)}_i \leftarrow d^{(1)}_i, d^{(2)}_i, d^{(3)}_i, ... ,d^{(r)}_i$,
+> > $p_j^{(r)} = p_j^{(0)} + a_{ij} \times (d_i^{(r)} - d_i^{(0)})$
+> 2. The Speculation:
+> > In this paper, its speculation means whether the parities need $d_i^{(0)}$ or not.
+> > it speculates that
+> > 1) Assume $d_i^{(0)}$ is NOT needed (mostly right)
+> > 2) Send $d_i^{(0)}$ only when it is actually needed (sometimes only)
+
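+A minimal sketch of the speculative partial write, simplified to a single-parity XOR code (i.e., $a_{ij}=1$ and $+/-$ are XOR); the real PARIX works over general Reed-Solomon coefficients, and the class/method names below are illustrative, not its API.
+
+```python
+class ParityServer:
+    def __init__(self, p0):
+        self.parity = p0      # current parity value p_j
+        self.base = {}        # data blocks whose previous version is known
+
+    def speculative_write(self, i, d_new, d_old=None):
+        """Speculation: assume d_i^(0) is already known (mostly right);
+        ask the data server to send it only when it is missing (sometimes only)."""
+        if i not in self.base:
+            if d_old is None:
+                return "NEED_D0"              # speculation failed: request d_i^(0)
+            self.base[i] = d_old
+        # p_j^(r) = p_j^(r-1) XOR (d_i^(r) XOR d_i^(r-1)); telescoping gives
+        # p_j^(r) = p_j^(0) XOR (d_i^(r) XOR d_i^(0)), matching the formula above.
+        self.parity ^= d_new ^ self.base[i]
+        self.base[i] = d_new                  # new value is the base for the next delta
+        return "OK"
+```
+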
+- PariX's overhead is a disk write, while previous parity logging schemes' overhead is **a disk write after a disk read**, assuming a read cache miss.
+
+- The whole workflow of PariX:
+
+ 
+
+**Note that**: For the case in replication, it is also (1 loop + 1 RTT)
+
+### Implementation and Evaluation
+- Based on a previous block store of this group; the master server is implemented with **MySQL** and **Redis**. (refer to Ursa, partly open source)
+- Random I/O latency
+- Recovery time
+
+## 2. Strength (Contributions of the paper)
+- The idea of this paper is not very complex, but it really works. Reducing the number of reads is an intuitive method to improve write performance.
+- the method can truly work in the scenario of
+## 3. Weakness (Limitations of the paper)
+- When there are too many one-shot writes in a workload, PariX cannot work well, because they do not fit its speculation.
+- When PariX's speculation fails, it needs to send $d_i^{(0)}$ to all parities; this penalty introduces high network traffic overhead. (Although it argues that parity writes are small.)
+- In its evaluation, it only tests the prototype on a small testbed (10 machines) with EC (4, 2). This setting cannot demonstrate its scalability.
+
+## 4. Future Works
+- In this method, the speculation is very simple and cannot handle one-shot writes well. How to handle large sequential one-shot write workloads? This may be a point that can be extended.
diff --git a/StoragePaperNote/ErasureCoding/EC-System/Giza-ATC'17.md b/StoragePaperNote/ErasureCoding/EC-System/Giza-ATC'17.md
old mode 100644
new mode 100755
index bcc1996..4832a7f
--- a/StoragePaperNote/ErasureCoding/EC-System/Giza-ATC'17.md
+++ b/StoragePaperNote/ErasureCoding/EC-System/Giza-ATC'17.md
@@ -1,61 +1,61 @@
----
-typora-copy-images-to: paper_figure
----
-# Giza: Erasure Coding Objects across Global Data Centers
-@ATC'17 @ Erasure Coding across data centers @Separate the data and metadata path
-[TOC]
-
-## Summary
-***Motivation of this paper***: This paper intends to solve the problem of reducing the cross-DC latency while maintaining the strong consistency of the metadata.
-
-***Microsoft OneDrive Characteristics***:
-- Large Objects Dominate: less than 0.9% of the total storage capacity is occupied by objects smaller than 4MB.
-- Object Teperature Drops Fast: Since the temperature of the objects drops quickly, caching objects can be very effective.
-- Writes Dominate with Caching: the cross-DC traffic is completely dominated by writes.
-- Cocurrency is rare, but versioning is required: concurrent updates of same objects are rare in Giza.
-- Deletion is Not Uncommon: removing the deleted objects from underlining cloud storage and reclaiming capacity is crucial in achieving storage efficiency.
-
-***Giza***:
-Giza is a strongly consistent versioned object store, built on top of Azure storage, and optimizes for latency and is fast in the common case. The goal of Giza:
-> 1) Giza should guarantee strong *consistency* while also minimizing operation latency.
-> 2) Giza should make full use of existing cloud infrastructure to simlipy its implementation and deployment.
-
-
-
-**Paxos using Cloud APIs**:
-Giza implements both **Paxos** and **Fast Paxos** to optimize the performance over cross-DC acceptors, and reduces the metadata path latency.
-
-| Paxos | Fast Paxos |
-| :------------------------------------------------------ | ------------------------------------------------------------ |
-| Can commit with 2 round trips | Can commit with 1 round trip |
-| Requires majority of replicas to commit $\frac{N}{2}+1$ | Requires more than the majority of replicas to commit $\frac{2N}{3}+1$ |
-
-The naive version of Giza first writes out fragments (data and parity), and then writes out metadata, resulting in two or more cross-DC round trips. (*need to reduce the latency*)
-**Giza Put Operation**:
->1. Execute metadata and data path in parallel.
->2. Upon success, return acknowledgement to the client
->3. In the background, update highest committed version number in the metadata row.
-
-**Giza Get Operation**:
->1. Optimistically read the latest commited version locally
->2. Execute both data path and metadata path in parallel:
->> a. Data path retrieves the data fragments indicated by the local latest committed version.
->> b. Metadata path will retrieve the metadata row from all data centers and validate the latest committed version.
-
-***Implementation and Evaluation***:
-Giza is implemented in C++ and uses Azure Blob and Table storage to store fragments and metadata.
-**Evaluation**:
-1. Latency of Put and Get: Fast vs Classic Paxos
-2. Latency of performance: In different setups
-3. Latency: Contention vs No Contention
-
-## Strength (Contributions of the paper)
-1. It has designed and implemented Giza, which is a strongly consisent, versioned object store that erasure codes objects across globally distributed data centers.
-2. Giza achieves minimum latency in the common case, when there are no concurrent conflicting access.
-3. Giza applie classic Paxos and Fast Paxos in a novel way on top of restricted cloud storage APIs.
-4. Giza is deployed in 11 DCs across 3 continents and experimental results demonstrate that it achieves its design goal.
-## Weakness (Limitations of the paper)
-1. This paper just consider the case of the standrad Reed-Solomon coding, can other erasure coding scheme further optimize the latency?
-
-## Future Works
-1. In stead of the standrad Reed-Solomon coding, Giza can further investigate other erasure codes to optimize its performance.
+---
+typora-copy-images-to: paper_figure
+---
+# Giza: Erasure Coding Objects across Global Data Centers
+@ATC'17 @ Erasure Coding across data centers @Separate the data and metadata path
+[TOC]
+
+## Summary
+***Motivation of this paper***: This paper intends to solve the problem of reducing the cross-DC latency while maintaining the strong consistency of the metadata.
+
+***Microsoft OneDrive Characteristics***:
+- Large Objects Dominate: less than 0.9% of the total storage capacity is occupied by objects smaller than 4MB.
+- Object Temperature Drops Fast: Since the temperature of the objects drops quickly, caching objects can be very effective.
+- Writes Dominate with Caching: the cross-DC traffic is completely dominated by writes.
+- Concurrency is rare, but versioning is required: concurrent updates of the same objects are rare in Giza.
+- Deletion is Not Uncommon: removing the deleted objects from the underlying cloud storage and reclaiming capacity is crucial to achieving storage efficiency.
+
+***Giza***:
+Giza is a strongly consistent versioned object store, built on top of Azure storage, and optimizes for latency and is fast in the common case. The goal of Giza:
+> 1) Giza should guarantee strong *consistency* while also minimizing operation latency.
+> 2) Giza should make full use of existing cloud infrastructure to simplify its implementation and deployment.
+
+
+
+**Paxos using Cloud APIs**:
+Giza implements both **Paxos** and **Fast Paxos** to optimize the performance over cross-DC acceptors, and reduces the metadata path latency.
+
+| Paxos | Fast Paxos |
+| :------------------------------------------------------ | ------------------------------------------------------------ |
+| Can commit with 2 round trips | Can commit with 1 round trip |
+| Requires majority of replicas to commit $\frac{N}{2}+1$ | Requires more than the majority of replicas to commit $\frac{2N}{3}+1$ |
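+
+A minimal Python sketch of the two quorum sizes implied by the table above (following the $\frac{N}{2}+1$ and $\frac{2N}{3}+1$ expressions as stated; pure arithmetic, not Giza's code):
+
+```python
+def classic_quorum(n):
+    """Classic Paxos: a simple majority of the N metadata replicas."""
+    return n // 2 + 1
+
+def fast_quorum(n):
+    """Fast Paxos: more than two thirds of the N replicas."""
+    return (2 * n) // 3 + 1
+
+for n in (3, 5, 7):
+    print(n, classic_quorum(n), fast_quorum(n))   # e.g., 3 -> 2 vs 3
+```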
+
+The naive version of Giza first writes out fragments (data and parity), and then writes out metadata, resulting in two or more cross-DC round trips. (*need to reduce the latency*)
+**Giza Put Operation**:
+>1. Execute metadata and data path in parallel.
+>2. Upon success, return acknowledgement to the client
+>3. In the background, update highest committed version number in the metadata row.
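+
+A minimal asyncio sketch of this Put flow; the coroutine names (write_fragments, write_metadata_paxos, update_committed_version) are placeholders standing in for Giza's data path and metadata path, not its actual API.
+
+```python
+import asyncio
+
+async def write_fragments(obj_id, version, fragments): ...       # data path (per-DC blobs)
+async def write_metadata_paxos(obj_id, version, locations): ...  # metadata path (Paxos over tables)
+async def update_committed_version(obj_id, version): ...         # background bookkeeping
+
+async def giza_put(obj_id, version, fragments, locations):
+    # 1. Execute the metadata path and data path in parallel.
+    await asyncio.gather(
+        write_fragments(obj_id, version, fragments),
+        write_metadata_paxos(obj_id, version, locations),
+    )
+    # 2. Both succeeded: acknowledge the client now.
+    # 3. Record the highest committed version number in the background.
+    asyncio.create_task(update_committed_version(obj_id, version))
+    return "ACK"
+```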
+
+**Giza Get Operation**:
+>1. Optimistically read the latest committed version locally
+>2. Execute both data path and metadata path in parallel:
+>> a. Data path retrieves the data fragments indicated by the local latest committed version.
+>> b. Metadata path will retrieve the metadata row from all data centers and validate the latest committed version.
+
+***Implementation and Evaluation***:
+Giza is implemented in C++ and uses Azure Blob and Table storage to store fragments and metadata.
+**Evaluation**:
+1. Latency of Put and Get: Fast vs Classic Paxos
+2. Latency of performance: In different setups
+3. Latency: Contention vs No Contention
+
+## Strength (Contributions of the paper)
+1. It has designed and implemented Giza, which is a strongly consistent, versioned object store that erasure-codes objects across globally distributed data centers.
+2. Giza achieves minimum latency in the common case, when there are no concurrent conflicting accesses.
+3. Giza applies classic Paxos and Fast Paxos in a novel way on top of restricted cloud storage APIs.
+4. Giza is deployed in 11 DCs across 3 continents and experimental results demonstrate that it achieves its design goal.
+## Weakness (Limitations of the paper)
+1. This paper only considers standard Reed-Solomon coding; can other erasure coding schemes further optimize the latency?
+
+## Future Works
+1. Instead of standard Reed-Solomon coding, Giza can further investigate other erasure codes to optimize its performance.
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Basics/Dimakis-TIT'09.md b/StoragePaperNote/ErasureCoding/Erasure Coding Basics/Dimakis-TIT'09.md
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Basics/Plank-FAST'09.md b/StoragePaperNote/ErasureCoding/Erasure Coding Basics/Plank-FAST'09.md
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Basics/Plank-USENIX login'13.md b/StoragePaperNote/ErasureCoding/Erasure Coding Basics/Plank-USENIX login'13.md
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Alpha-Entanglement-Codes-DSN'18.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Alpha-Entanglement-Codes-DSN'18.md
old mode 100644
new mode 100755
index dab8333..f7ce3d4
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Alpha-Entanglement-Codes-DSN'18.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Alpha-Entanglement-Codes-DSN'18.md
@@ -1,44 +1,44 @@
----
-typora-copy-images-to: paper_figure
----
-# Alpha Entanglement Codes: Practical Erasure Codes to Archive Data in Unreliable Environments
-@DSN'18 @ Redundancy Propagation
-[TOC]
-
-## Summary
-***Motivation of this paper***: this paper intends to design flexible and practical erasure codes with high fault-tolerance to improve data durability and availability even in catastrophic scenarios.
-
-***Alpha Entanglement Codes (AEC) $(\alpha, s, p)$***: The idea of AE code is to create redundancy by **tangling (mixing)** new data and blocks with old ones, building entangled data chains that are woven into a growing mesh of interdependent content.
->1. $\alpha$: determine the local connectivity, the number of parities created per data block.
->2. $s$ amd $p$: determin the global connectiviry of data blocks in the grid. $p$ defines the number of helical strands. $s$ the number of horizontal strands
-
-Each node belongs to $\alpha$ strands each edge belongs to only one strand.
-
-- Alpha Entanglements permit changes in the paramenters without the need to encode the content again (**dynamic** fault-tolerance, long-term storage system)
-- The encoder builds block chains, strands, that alternate data and redundant blocks.
-> The entanglement function computes the exclusive-or (XOR) of two consecutive blocks at the head of a strand and inserts the output adjacent to the last block.
-
-- The interwined graph of AEC:
-
-
-
-- The decoder repairs a node using two adjacent edges that belong to the same strand, thus, there are $\alpha$ options.
-
-***Implementation and Evaluation***:
-**Implementation**: It mentions two use cases of entangled storage system
->1. A Geo-Replicated Backup
->2. Disk Arrays
-
-**Evaluation**: It examines the design of AE codes to understand how the code settings impact on fault tolerance and write performance.
->1. Code Parameters and Write Performance
->2. Code Parameters and Fault Tolerance
-
-## Strength (Contributions of the paper)
-1. These mechanisms based on the novel redundancy propagation, which are robust and sound, but they are also efficient and easily implementable in real system.
-2. The encoder and decoder are lightweight, based on exclusive-or operations, and offer promising trade-offs between security, resource usage and performance.
-3. This paper also mentions the use cases for distributed and centralised storage systems.
-## Weakness (Limitations of the paper)
-1. This paper does not mention how to improve the efficiency of repairs. It just focuses on reducing the maximum possible storage overhead.
-
-## Future Works
+---
+typora-copy-images-to: paper_figure
+---
+# Alpha Entanglement Codes: Practical Erasure Codes to Archive Data in Unreliable Environments
+@DSN'18 @ Redundancy Propagation
+[TOC]
+
+## Summary
+***Motivation of this paper***: this paper intends to design flexible and practical erasure codes with high fault-tolerance to improve data durability and availability even in catastrophic scenarios.
+
+***Alpha Entanglement Codes (AEC) $(\alpha, s, p)$***: The idea of AE code is to create redundancy by **tangling (mixing)** new data and blocks with old ones, building entangled data chains that are woven into a growing mesh of interdependent content.
+>1. $\alpha$: determines the local connectivity, i.e., the number of parities created per data block.
+>2. $s$ and $p$: determine the global connectivity of data blocks in the grid. $p$ defines the number of helical strands; $s$ defines the number of horizontal strands.
+
+Each node belongs to $\alpha$ strands; each edge belongs to only one strand.
+
+- Alpha entanglements permit changes in the parameters without the need to encode the content again (**dynamic** fault tolerance, suited to long-term storage systems).
+- The encoder builds block chains, strands, that alternate data and redundant blocks.
+> The entanglement function computes the exclusive-or (XOR) of two consecutive blocks at the head of a strand and inserts the output adjacent to the last block.
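+
+A minimal sketch of this entanglement step for a single strand, assuming fixed-size blocks represented as bytes; it only illustrates the XOR chaining of one strand, not the full $(\alpha, s, p)$ mesh, and the function names are illustrative.
+
+```python
+def xor_blocks(a: bytes, b: bytes) -> bytes:
+    return bytes(x ^ y for x, y in zip(a, b))
+
+def extend_strand(strand: list, new_data: bytes) -> bytes:
+    """Append a data block and the parity entangling it with the block
+    currently at the head (end) of the strand; strands alternate data and parity."""
+    parity = xor_blocks(strand[-1], new_data)   # XOR of two consecutive head blocks
+    strand.extend([new_data, parity])
+    return parity
+
+strand = [bytes(4)]                              # bootstrap with an all-zero block
+extend_strand(strand, b"\x01\x02\x03\x04")
+# A missing block on the strand can be rebuilt as the XOR of its two neighbors.
+```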
+
+- The intertwined graph of AEC:
+
+
+
+- The decoder repairs a node using two adjacent edges that belong to the same strand, thus, there are $\alpha$ options.
+
+***Implementation and Evaluation***:
+**Implementation**: It mentions two use cases of entangled storage system
+>1. A Geo-Replicated Backup
+>2. Disk Arrays
+
+**Evaluation**: It examines the design of AE codes to understand how the code settings impact on fault tolerance and write performance.
+>1. Code Parameters and Write Performance
+>2. Code Parameters and Fault Tolerance
+
+## Strength (Contributions of the paper)
+1. These mechanisms are based on novel redundancy propagation; they are robust and sound, and also efficient and easily implementable in real systems.
+2. The encoder and decoder are lightweight, based on exclusive-or operations, and offer promising trade-offs between security, resource usage and performance.
+3. This paper also mentions the use cases for distributed and centralised storage systems.
+## Weakness (Limitations of the paper)
+1. This paper does not mention how to improve the efficiency of repairs. It just focuses on reducing the maximum possible storage overhead.
+
+## Future Works
1. A point that can be considered is how to improve its efficiency of repairs.
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/ButterflyCode-FAST'16.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/ButterflyCode-FAST'16.md
old mode 100644
new mode 100755
index d2d2074..98b39c8
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/ButterflyCode-FAST'16.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/ButterflyCode-FAST'16.md
@@ -1,41 +1,41 @@
----
-typora-copy-images-to: paper_figure
----
-# Opening the Chrysalis: On the Real Reapir Performance of MSR Codes
-@FAST'16 @MSR Code
-[TOC]
-
-## Summary
-***Motivation of this paper***: The implementation of RGCs in production systems requires dealing with complex and bug-prone algorithms. In this paper, it focuses on managing the drawbacks of RGC-MSR codes.
-
-***Butterfly Codes***:
-**Encode**: The key point in Butterfly Codes is its **recursive** construction approach, which can have a simplified design that results in a simpler implementation, and allow for a better reuse of precomputed values, leading to better cache locality.
-
-Because of the double vertical flip in equations above, the recursion can be simplified
-
-
-**Decode**: In the case of one failure, the lost column can be regenerated by communicating an amount of data equal to 1/2 of the remaining data.
-> If the lost column is not the butterfly parity, the amount of communicated data is exactly equal to the amount read from suriving disk. (**optimal I/O access**)
-
-***Implementation and Evaluation***:
-**Ceph**: RADOS uses an erasure code plug-in infrastructure that allows dynamical use of external erasure code libraries. Third-party developers can provide independent erasure code implementation.Efficient integration of Butterfly code with Ceph requires **re-defining** the plug-in interface. It also implements a **proxy plug-in** that dynamically links with existing plug-ins.
-
-**Evaluation**: In HDFS and Ceph
->1. Repair Throughput:
->2. CPU Utilization
->3. Network Traffic
->4. Storage Traffic
-
-## Strength (Contributions of the paper)
-
-1. design a recursive Butterfly code construction (two parity MSR code) and implement it in two real-world distributed storage systems (HDFS and Ceph).
-2. compare two major approaches when using erasure codes in distributed storage systems (**online** and **batch-based** encoding) and point the major tradeoffs between the two approaches.
-3. examine the performance of Butterfly code and draw a comparison between the theoretical results of MSR codes and the performance achievable in real systems.
-
-## Weakness (Limitations of the paper)
-1. Butterfly Code is not very general, it can just tolerate that any two of the codeword columns are missing
-
-## Future Work
-1. How to make Butterfly codes more general is another issue that can be extended in the future.
-
-
+---
+typora-copy-images-to: paper_figure
+---
+# Opening the Chrysalis: On the Real Repair Performance of MSR Codes
+@FAST'16 @MSR Code
+[TOC]
+
+## Summary
+***Motivation of this paper***: The implementation of RGCs in production systems requires dealing with complex and bug-prone algorithms. In this paper, it focuses on managing the drawbacks of RGC-MSR codes.
+
+***Butterfly Codes***:
+**Encode**: The key point of Butterfly codes is their **recursive** construction approach, which yields a simplified design and a simpler implementation, and allows for better reuse of precomputed values, leading to better cache locality.
+
+Because of the double vertical flip in equations above, the recursion can be simplified
+
+
+**Decode**: In the case of one failure, the lost column can be regenerated by communicating an amount of data equal to 1/2 of the remaining data.
+> If the lost column is not the butterfly parity, the amount of communicated data is exactly equal to the amount read from the surviving disks. (**optimal I/O access**)
+
+***Implementation and Evaluation***:
+**Ceph**: RADOS uses an erasure code plug-in infrastructure that allows dynamic use of external erasure code libraries. Third-party developers can provide independent erasure code implementations. Efficient integration of the Butterfly code with Ceph requires **re-defining** the plug-in interface. It also implements a **proxy plug-in** that dynamically links with existing plug-ins.
+
+**Evaluation**: In HDFS and Ceph
+>1. Repair Throughput:
+>2. CPU Utilization
+>3. Network Traffic
+>4. Storage Traffic
+
+## Strength (Contributions of the paper)
+
+1. design a recursive Butterfly code construction (two parity MSR code) and implement it in two real-world distributed storage systems (HDFS and Ceph).
+2. compare two major approaches when using erasure codes in distributed storage systems (**online** and **batch-based** encoding) and point the major tradeoffs between the two approaches.
+3. examine the performance of Butterfly code and draw a comparison between the theoretical results of MSR codes and the performance achievable in real systems.
+
+## Weakness (Limitations of the paper)
+1. The Butterfly code is not very general; it can only tolerate the loss of any two codeword columns.
+
+## Future Work
+1. How to make Butterfly codes more general is another issue that can be extended in the future.
+
+
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/DRC-ISIT'16.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/DRC-ISIT'16.md
old mode 100644
new mode 100755
index a518690..8e8a1d8
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/DRC-ISIT'16.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/DRC-ISIT'16.md
@@ -1,34 +1,34 @@
----
-typora-copy-images-to: paper_figure
----
-# Double Regenerating Codes for Hierarchical Data Centers
-@ISIT'16 @Regenerating Codes
-[TOC]
-
-## Summary
-***Motivation of this paper***: Deploying erasure coding in data center remains challenging due to the hierarchical nature of data centers (multiple racks, each comprising multiple nodes for storage). And the cross-rack bandwidth is heavily oversubscribed. Thus, this paper's goal is to minimize the cross-rack repair bandwidth in hierarchical data center.
-
-***Double Regenerating Codes***
-The core idea of DRC is to perform regeneration **twice**: first within a rack and the across multiple racks. So it calls this approach **double regeneration**.
-> repair design targets the unbalanced nature of inner-rack and cross-rack capacities in hierarchical data centers.
-
-
-- Double Regeneration makes two trade-offs:
->1. It can only tolerate a single-rack failure
->2. the sum of the inner-rack and cross-rack repair bandwidth is higher than that in MSR codes. (trade the inner-rack repair bandwidth for the cross-rack repair bandwidth)
-
-- DRC exploits node cooperation within a rack to minimize the cross-rack repair bandwidth.
-
-***Implementation and Evaluation***:
-**Implementation**
-None
-**Evaluation**
-Compared with RS, MSR, IEE
-
-## Strength (Contributions of the paper)
-1. prove that there exists a DRC construction that minimizes the cross-rack repair bandwidth for a single-node repair , while preserving the MDS property.
-2. show via quantitative comparisons that DRC reduces the cross-rack repair bandwidth of state-of-the-art MSR codes by up to 45.5%.
-## Weakness (Limitations of the paper)
-1. DRC can only tolerate a single-rack failure (as opposed to the nomal code)
-## Future Works
+---
+typora-copy-images-to: paper_figure
+---
+# Double Regenerating Codes for Hierarchical Data Centers
+@ISIT'16 @Regenerating Codes
+[TOC]
+
+## Summary
+***Motivation of this paper***: Deploying erasure coding in data centers remains challenging due to the hierarchical nature of data centers (multiple racks, each comprising multiple storage nodes), and the cross-rack bandwidth is heavily oversubscribed. Thus, this paper's goal is to minimize the cross-rack repair bandwidth in hierarchical data centers.
+
+***Double Regenerating Codes***
+The core idea of DRC is to perform regeneration **twice**: first within a rack and then across multiple racks. Hence this approach is called **double regeneration**.
+> repair design targets the unbalanced nature of inner-rack and cross-rack capacities in hierarchical data centers.
+
+
+- Double Regeneration makes two trade-offs:
+>1. It can only tolerate a single-rack failure
+>2. the sum of the inner-rack and cross-rack repair bandwidth is higher than that in MSR codes. (trade the inner-rack repair bandwidth for the cross-rack repair bandwidth)
+
+- DRC exploits node cooperation within a rack to minimize the cross-rack repair bandwidth.
+
+***Implementation and Evaluation***:
+**Implementation**
+None
+**Evaluation**
+Compared with RS, MSR, IEE
+
+## Strength (Contributions of the paper)
+1. prove that there exists a DRC construction that minimizes the cross-rack repair bandwidth for a single-node repair, while preserving the MDS property.
+2. show via quantitative comparisons that DRC reduces the cross-rack repair bandwidth of state-of-the-art MSR codes by up to 45.5%.
+## Weakness (Limitations of the paper)
+1. DRC can only tolerate a single-rack failure (as opposed to normal codes).
+## Future Works
1. How about leveraging the special topological structure of rack-based data center to further reduce the repair overhead?
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Galloper-ICDCS'18.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Galloper-ICDCS'18.md
old mode 100644
new mode 100755
index 143d61d..0d085a9
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Galloper-ICDCS'18.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Galloper-ICDCS'18.md
@@ -1,46 +1,46 @@
----
-typora-copy-images-to: paper_figure
----
-# Parallelism-Aware Locally Repairable Code for Distributed Storage Systems
-@ICDCS'18 @Data Parallelism
-[TOC]
-
-## Summary
-***Motivation of this paper***: The existing designs of locally repairable codes suffer from lilited data parallelism, since original data can only be read from specific servers. In this paper, it proposes a novel family of locally repairable codes that can achieve **low disk I/O** during reconstrcution and meanwhile **extend data parallelism from specific servers to all servers**.
-
-***Galloper codes***
-- The Galloper codes focus on the low disk I/O during reconstruction and high data parallelism at the same time.
-> Main Challenge: how to spread the parity block and maintain the original properites of Pyramid codes in terms of locality and failure tolerance.
-
-
-- Galloper codes use the **symbol remapping** to achieve the "moving" the original data from the data blocks to all blocks. To this end, each block will need to contain both original data and parity data.
-
-- Given a $(k, l, g)$ Galloper code, there will be $k+l+g$ blocks in total, including $k$ data blocks, $l$ local parity blocks, and $g$ global parity blocks. Among the total $k+g+l$ blocks, it also allows to associate each block with a weight that corresponds to the performance of its server. (e.g., the throughput of sequential disk read)
-
-Compared with a $(4, 1)$ Reed-Solomon code, this Galloper code achieves the same failure tolerance, i.e., all original data can be decoded from any four blocks.
-
-- The idea to convert the Reed-Solomon code into a Galloper code is to find another set of $kN$ stripes as a new basis.
-
-**Weight Assignment**
-If the performance of a server is too much higher than the rest of the servers. It should "limit" the performance of that server. Then, it can determine the actual performance of each server by solving the following **linear programming problem**.
-
-***Implementation and Evaluation***
-**Implementation**:
-1. use Intel's storage acceleration library (ISA-L) to implement the finite field operations
-2. also implement a prototype on Apache Hadoop
-
-**Evaluation**
-comparing with Reed-Solomon codes and Pytamid codes
->1. performance of Encoding, Decoding, and Reconstruction
->2. performance of Running Hadoop Jobs
-
-Galloper codes achieve similar performance during most coding operations as existing Pyamid codes, but significantly improve the performance of running data analytical jobs on the coded data, on both homogeneous and heterogeneous servers.
-
-## Strength (Contributions of the paper)
-1. This paper proposes Galloper codes, a novel family of locally repairable codes, that achieve low disk I/O during reconstruction and meanwhile extend data parallelism from specific servers to all servers.
-2. Galloper codes can arbitrarily determine the amount of original data placed on all servers, based on the performance of the coresponding server.
-3. It also develops a prototype with Apache Hadoop.
-## Weakness (Limitations of the paper)
-1. the construction of Galloper code needs to choose a larger stripes set as the a basis, which can impose a high overhead of construction.
-## Future Works
+---
+typora-copy-images-to: paper_figure
+---
+# Parallelism-Aware Locally Repairable Code for Distributed Storage Systems
+@ICDCS'18 @Data Parallelism
+[TOC]
+
+## Summary
+***Motivation of this paper***: The existing designs of locally repairable codes suffer from limited data parallelism, since original data can only be read from specific servers. In this paper, it proposes a novel family of locally repairable codes that can achieve **low disk I/O** during reconstruction and meanwhile **extend data parallelism from specific servers to all servers**.
+
+***Galloper codes***
+- The Galloper codes focus on low disk I/O during reconstruction and high data parallelism at the same time.
+> Main challenge: how to spread the parity blocks while maintaining the original properties of Pyramid codes in terms of locality and failure tolerance.
+
+
+- Galloper codes use **symbol remapping** to "move" the original data from the data blocks to all blocks. To this end, each block needs to contain both original data and parity data.
+
+- Given a $(k, l, g)$ Galloper code, there will be $k+l+g$ blocks in total, including $k$ data blocks, $l$ local parity blocks, and $g$ global parity blocks. Among the total $k+l+g$ blocks, it also allows associating each block with a weight that corresponds to the performance of its server (e.g., the throughput of sequential disk reads).
+
+Compared with a $(4, 1)$ Reed-Solomon code, this Galloper code achieves the same failure tolerance, i.e., all original data can be decoded from any four blocks.
+
+- The idea to convert the Reed-Solomon code into a Galloper code is to find another set of $kN$ stripes as a new basis.
+
+**Weight Assignment**
+If the performance of a server is much higher than that of the rest of the servers, it should be "limited". Then, the actual performance of each server can be determined by solving a **linear programming problem** (a simpler proportional sketch is given below).
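+
+The LP itself is not reproduced in these notes; as a much simpler stand-in (my own illustration, not the paper's formulation), the sketch below spreads the original data across all $k+l+g$ blocks in proportion to hypothetical per-server weights, so that faster servers hold more original data for parallel reads.
+
+```python
+# Hypothetical proportional placement: share of original stripes per block,
+# weighted by each server's (assumed) sequential-read throughput.
+def original_data_share(weights, total_original_stripes):
+    total = sum(weights)
+    return [round(total_original_stripes * w / total) for w in weights]
+
+# e.g. 6 blocks of a (4, 1, 1)-style layout on heterogeneous servers
+weights = [1.0, 1.0, 2.0, 2.0, 1.0, 1.0]   # hypothetical throughputs
+print(original_data_share(weights, 24))     # original stripes assigned to each block
+```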
+
+***Implementation and Evaluation***
+**Implementation**:
+1. use Intel's storage acceleration library (ISA-L) to implement the finite field operations
+2. also implement a prototype on Apache Hadoop
+
+**Evaluation**
+comparing with Reed-Solomon codes and Pyramid codes
+>1. performance of Encoding, Decoding, and Reconstruction
+>2. performance of Running Hadoop Jobs
+
+Galloper codes achieve similar performance during most coding operations as existing Pyramid codes, but significantly improve the performance of running data analytical jobs on the coded data, on both homogeneous and heterogeneous servers.
+
+## Strength (Contributions of the paper)
+1. This paper proposes Galloper codes, a novel family of locally repairable codes, that achieve low disk I/O during reconstruction and meanwhile extend data parallelism from specific servers to all servers.
+2. Galloper codes can arbitrarily determine the amount of original data placed on all servers, based on the performance of the corresponding server.
+3. It also develops a prototype with Apache Hadoop.
+## Weakness (Limitations of the paper)
+1. The construction of Galloper codes needs to choose a larger set of stripes as a basis, which can impose a high construction overhead.
+## Future Works
1. How to reduce the overhead of the construction in Galloper code can be the future work.
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Hitchhiker-SIGCOMM'14.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Hitchhiker-SIGCOMM'14.md
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/LRC-ATC'12.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/LRC-ATC'12.md
old mode 100644
new mode 100755
index 199c066..a77caa9
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/LRC-ATC'12.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/LRC-ATC'12.md
@@ -1,51 +1,51 @@
----
-typora-copy-images-to: paper_figure
----
-# Erasure Coding in Windows Azure Storage
-@ATC'12 @ Local Reconstruction Codes
-[TOC]
-
-## Summary
-***Motivation of this paper***: To provide durability for customers' data and to keep the cost of storage low, Windows Azure Storage uses erasure coding. To reduce the number of erasure coding fragments that are offline, while still keeping the storage overhead low, this paper propses Local Reconstruction Codes (LRC)
-
-The **goals** of LRC are:
-1. reduce the minimal number of fragments that need to be read from to reconstruct a data fragment.
-2. provide significant reduction in storage overhead while maintaining higher durability than a system that keeps 3 replicas for the data.
-
-***Local Reconstruction Codes***:
-To achieve these goals, it computes some of parities from a subset of the data fragments. (**Global parities** and **Local parities**)
->Global parities: are computed from **all** the data fragments
->Local parities: are computed from **a group** of data fragments
-
-
-However, LRC needs to add more parities that Reed-Solomon Code, it does the tradeoff between storage overhead and reconstruction cost. And LRC is not Maximum Distance Separable (MDS). It exists the type of **information-theoretically non-decodable**.
-
-Thus, it is also a challenge to construct a single set of **coding equations** that achieves the *Maximally Recoverable (MR)* property, or being able to decode all the information-theoretically decodable failure patterns. And this paper also discusses how to construct coding equations.
-
-For checking decodability, it also proposes an algorithm: *Firstly, check each local group, then, examine the data fragments and the global parities.*
-
-In general, the key properties of a $(k, l, r)$ LRC are: 1) single data fragment failure can be decoded from $\frac{k}{l}$ fragments; 2) arbitrary failures up to $r+1$ can be decoded.
-
-***Reliability Model and Code Selection***:
-For its reliability model, it adds a simple extension to generalize the Markov Model, in order to capture the unique state transitions in LRC. (because failure mode not only depends on the size of node fails, but also which subset of nodes fails.).
-This paper also plots the trade-off curve between the **storage overhead** and **reconstruction cost**, and compares it with RS code and Modern codes.
-
-***Implementation and Evaluation***
-**Implementation:** Its choice is to implement erasure coding inside the stream layer. It is based on the fact that it fits the overall WAS architecture where the stream layer is responsible for keeping the data durable within a stamp.
-
-**Evaluation:** It compare the LRC's performance with RS code for **small I/Os** and **large I/Os**, respectively.
->1. small I/Os: latency and the number of I/Os taken by the requests;
->2. large I/Os: latency and bandwidth consumption
->3. decoding latency
-
-## Strength (Contributions of the paper)
-This paper proposes the Local Reconstruction Codes which can 1) reduce the minimal number of fragments that need to be read from to reconstruct a data fragment and 2) provide significant reduction in storage overhead while maintaining higher durability than a system that keep 3 replicas for the data.
-
-## Weakness (Limitations of the paper)
-1. LRC is not a MDS code, it can not tolerant all the failure, it needs to identify whether the failure mode is recoverable.
-2. In its reliability model, it not mentions how to deal with correlated failures in detail.
-3. To construct coding equation for achieving the maximum recoverable property, it also is a challengeable issue.
-
-## Future Work
-For the first weakness, it can also further consider how to improve the fault tolerance better.
-
+---
+typora-copy-images-to: paper_figure
+---
+# Erasure Coding in Windows Azure Storage
+@ATC'12 @ Local Reconstruction Codes
+[TOC]
+
+## Summary
+***Motivation of this paper***: To provide durability for customers' data and to keep the cost of storage low, Windows Azure Storage uses erasure coding. To reduce the number of fragments that need to be read when reconstructing offline fragments, while still keeping the storage overhead low, this paper proposes Local Reconstruction Codes (LRC).
+
+The **goals** of LRC are:
+1. reduce the minimal number of fragments that need to be read from to reconstruct a data fragment.
+2. provide significant reduction in storage overhead while maintaining higher durability than a system that keeps 3 replicas for the data.
+
+***Local Reconstruction Codes***:
+To achieve these goals, it computes some of parities from a subset of the data fragments. (**Global parities** and **Local parities**)
+>Global parities: are computed from **all** the data fragments
+>Local parities: are computed from **a group** of data fragments
+
+
+However, LRC needs to add more parities than Reed-Solomon codes; it makes a tradeoff between storage overhead and reconstruction cost. Moreover, LRC is not Maximum Distance Separable (MDS): some failure patterns are **information-theoretically non-decodable**.
+
+Thus, it is also a challenge to construct a single set of **coding equations** that achieves the *Maximally Recoverable (MR)* property, or being able to decode all the information-theoretically decodable failure patterns. And this paper also discusses how to construct coding equations.
+
+For checking decodability, it also proposes an algorithm: *First, check each local group; then, examine the data fragments and the global parities.*
+
+In general, the key properties of a $(k, l, r)$ LRC are: 1) a single data-fragment failure can be decoded from $\frac{k}{l}$ fragments; 2) arbitrary failures of up to $r+1$ fragments can be decoded.
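+
+A quick numeric check of the first property (my own helper, not code from the paper): a single data-fragment repair reads the $\frac{k}{l}$ fragments of its local group, versus $k$ fragments under Reed-Solomon.
+
+```python
+# Compare single-fragment repair reads for a (k, l, r) LRC and a k-data-fragment RS code.
+def lrc_single_repair_reads(k, l):
+    assert k % l == 0, "assume equal-size local groups"
+    return k // l                      # surviving group fragments + local parity
+
+def rs_single_repair_reads(k):
+    return k                           # RS must read k fragments to decode
+
+k, l, r = 12, 2, 2                     # e.g. a (12, 2, 2) LRC
+print(lrc_single_repair_reads(k, l), "fragments for LRC vs", rs_single_repair_reads(k), "for RS")
+```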
+
+***Reliability Model and Code Selection***:
+For its reliability model, it adds a simple extension to generalize the Markov model, in order to capture the unique state transitions in LRC (because the failure mode depends not only on how many nodes fail, but also on which subset of nodes fails).
+This paper also plots the trade-off curve between the **storage overhead** and **reconstruction cost**, and compares it with RS code and Modern codes.
+
+***Implementation and Evaluation***
+**Implementation:** Its choice is to implement erasure coding inside the stream layer. It is based on the fact that it fits the overall WAS architecture where the stream layer is responsible for keeping the data durable within a stamp.
+
+**Evaluation:** It compares LRC's performance with RS codes for **small I/Os** and **large I/Os**, respectively.
+>1. small I/Os: latency and the number of I/Os taken by the requests;
+>2. large I/Os: latency and bandwidth consumption
+>3. decoding latency
+
+## Strength (Contributions of the paper)
+This paper proposes the Local Reconstruction Codes which can 1) reduce the minimal number of fragments that need to be read from to reconstruct a data fragment and 2) provide significant reduction in storage overhead while maintaining higher durability than a system that keeps 3 replicas for the data.
+
+## Weakness (Limitations of the paper)
+1. LRC is not an MDS code; it cannot tolerate all failure patterns and needs to identify whether a given failure pattern is recoverable.
+2. Its reliability model does not discuss how to deal with correlated failures in detail.
+3. Constructing coding equations that achieve the Maximally Recoverable property is also a challenging issue.
+
+## Future Work
+For the first weakness, future work could consider how to further improve the fault tolerance.
+
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/NCCloud-FAST'12.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/NCCloud-FAST'12.md
old mode 100644
new mode 100755
index abc7122..6e33874
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/NCCloud-FAST'12.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/NCCloud-FAST'12.md
@@ -1,46 +1,46 @@
----
-typora-copy-images-to: paper_figure
----
-# NCCloud: A Network-Coding-Based Storage System in a Cloud-of Clouds
-@FAST'12 @ Regenerating Code
-[TOC]
-
-## Summary
-***Motivation of this paper***: It is important to do the repair in multiple cloud storage. This paper's objective is to minimize the cost of storage repair (due to the maintain of data over the clouds) for a permanent sing-cloud failure.
-
-
-***Functional Minimum-Storage Regenerating Code (FMSR)***:
-This paper present a **proxy-based**, multiple-cloud storage system, called NCCloud, which can practically address the reliability of today's cloud backup storage. NCCloud implements FMSR, which eliminates the encoding requirement if storage nodes during repair, while ensuring the new set of stored chunks after each round of repair preserves the required fault tolerance.
-
-Code chunk $P_i$ equals to linear combination of original data chunks. For the repair in FMSR, it downloads one code chunk from each surviving node and reconstructs new code chunks (via random linear combination) in new node.
-However, FMSR codes is not systematic, FMSR codes are acceptable for **long-term** archival applications, where the read frequency is typically low.
-
-**Iterative Repairs**:
-FMSR codes regenerates different chunks in each repair, so it is necessary to ensure **MDS property** still holds even after iterative repairs. To achieve this, it also proposes a **two-phase** checking method.
->1. **MDS property check**: current repair maintains MDS property.
->2. **Repair MDS property check**: next repair for any possible failure maintains MDS property.
-
-
-***Implementation and Evaluation***:
-It implements NCCloud as a proxy that bridges user applications and multiple clouds including three layers:
->1. file system layer
->2. coding layer (both RAID-6 and FMSR)
->3. storage layer
-
-**Evaluation**:
-Response time: both in Local Cloud (OpenStack Swift) and Commercial Cloud (multiple containers in Azure) with different file sizes.
-
-## Strength (Contributions of the paper)
-1. design FMSR code that can be deployed in thin-cloud setting as they do require storage nodes to perform encoding during repair.
-2. implement FMSR code with a two-phase checking scheme, which ensures that double-fault tolerance is maintained in the current and next round of repair.
-3. conduct monetary cost analysis and extensive experiments on both local cloud and commercial cloud settings.
-
-## Weakness (Limitations of the paper)
-1. FMSR code is not a systematic code that it only stores encoded chunks formed by the linear combination of the original data chunks, and does not keep the original data. Thus, the overhead of frequent reading is high.
-2. In this paper, it just considers an FMSR code implementation with double-fault tolerance.
-
-## Future Work
-1. consider how to reduce the encoding overhead of FMSR code
-2. make the FMSR code more general, beyond double-fault tolerance
-
-
+---
+typora-copy-images-to: paper_figure
+---
+# NCCloud: A Network-Coding-Based Storage System in a Cloud-of-Clouds
+@FAST'12 @ Regenerating Code
+[TOC]
+
+## Summary
+***Motivation of this paper***: It is important to do the repair in multiple-cloud storage. This paper's objective is to minimize the cost of storage repair (due to the maintenance of data over the clouds) for a permanent single-cloud failure.
+
+
+***Functional Minimum-Storage Regenerating Code (FMSR)***:
+This paper presents a **proxy-based**, multiple-cloud storage system, called NCCloud, which can practically address the reliability of today's cloud backup storage. NCCloud implements FMSR, which eliminates the encoding requirement of storage nodes during repair, while ensuring that the new set of stored chunks after each round of repair preserves the required fault tolerance.
+
+Each code chunk $P_i$ is a linear combination of the original data chunks. For a repair in FMSR, it downloads one code chunk from each surviving node and reconstructs new code chunks (via random linear combinations) on the new node.
+However, FMSR codes are not systematic, so they are mainly acceptable for **long-term** archival applications, where the read frequency is typically low.
+
+**Iterative Repairs**:
+FMSR codes regenerate different chunks in each repair, so it is necessary to ensure that the **MDS property** still holds even after iterative repairs. To achieve this, it also proposes a **two-phase** checking method (a sketch of the MDS check is given after the list below).
+>1. **MDS property check**: current repair maintains MDS property.
+>2. **Repair MDS property check**: next repair for any possible failure maintains MDS property.
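+
+A minimal sketch of the "MDS property check" idea (my own illustration over the rationals rather than $GF(2^w)$, so it is not the paper's implementation): with $n$ nodes each storing $n-k$ coded chunks of $k(n-k)$ native chunks, every subset of $k$ nodes must yield a full-rank coefficient matrix.
+
+```python
+from fractions import Fraction
+from itertools import combinations
+import random
+
+def rank(mat):
+    """Gaussian elimination over the rationals (exact arithmetic)."""
+    mat = [list(map(Fraction, row)) for row in mat]
+    r = 0
+    for col in range(len(mat[0])):
+        piv = next((i for i in range(r, len(mat)) if mat[i][col] != 0), None)
+        if piv is None:
+            continue
+        mat[r], mat[piv] = mat[piv], mat[r]
+        for i in range(len(mat)):
+            if i != r and mat[i][col] != 0:
+                f = mat[i][col] / mat[r][col]
+                mat[i] = [a - f * b for a, b in zip(mat[i], mat[r])]
+        r += 1
+    return r
+
+def mds_check(coeffs, n, k):
+    """coeffs[i*(n-k)+j] is the coefficient vector of the j-th chunk on node i."""
+    per_node, native = n - k, k * (n - k)
+    for nodes in combinations(range(n), k):
+        rows = [coeffs[i * per_node + j] for i in nodes for j in range(per_node)]
+        if rank(rows) < native:
+            return False
+    return True
+
+n, k = 4, 2
+coeffs = [[random.randint(1, 97) for _ in range(k * (n - k))] for _ in range(n * (n - k))]
+print("MDS property holds:", mds_check(coeffs, n, k))
+```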
+
+
+***Implementation and Evaluation***:
+It implements NCCloud as a proxy that bridges user applications and multiple clouds including three layers:
+>1. file system layer
+>2. coding layer (both RAID-6 and FMSR)
+>3. storage layer
+
+**Evaluation**:
+Response time: both in Local Cloud (OpenStack Swift) and Commercial Cloud (multiple containers in Azure) with different file sizes.
+
+## Strength (Contributions of the paper)
+1. design FMSR codes that can be deployed in a thin-cloud setting, as they do not require storage nodes to perform encoding during repair.
+2. implement FMSR code with a two-phase checking scheme, which ensures that double-fault tolerance is maintained in the current and next round of repair.
+3. conduct monetary cost analysis and extensive experiments on both local cloud and commercial cloud settings.
+
+## Weakness (Limitations of the paper)
+1. FMSR codes are not systematic: they only store encoded chunks formed by linear combinations of the original data chunks and do not keep the original data in uncoded form. Thus, the overhead of frequent reads is high.
+2. In this paper, it just considers an FMSR code implementation with double-fault tolerance.
+
+## Future Work
+1. consider how to reduce the encoding overhead of FMSR code
+2. make the FMSR code more general, beyond double-fault tolerance
+
+
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Pyramid Codes-ToS'13.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Pyramid Codes-ToS'13.md
old mode 100644
new mode 100755
index 35f008d..580e20e
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Pyramid Codes-ToS'13.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/Pyramid Codes-ToS'13.md
@@ -1,61 +1,61 @@
----
-typora-copy-images-to: paper_figure
----
-# Pyramid Codes: Flexible Schemes to Trade Space for Access Efficiency in Reliable Data Storage Systems
-@ToS'13 @Erasure Code
-[TOC]
-
-## Summary
-***Motivation of this paper***: Replication and MDS-based ERC are regarded as two extremes of the tradeoffs between storage space and access efficiency. This paper intends to find the middle ground.
-
-***Method***
-- Four key metrics concerning any ERC scheme:
- 1) Storage overhead
- 2) Fault Tolerance (unrecoverable probability)
- 3) Access Efficiency (the expected number of blocks required to serve an unavailable data block)
- 4) Update Complexity
-
-
-**Basic Pyramid Codes**:
-
-It divides the six data blocks into two equal size groups $S_1 = \{d_1, d_2, d_3\}$, $S_2=\{d_4,d_5,d_6\}$. Then computer one redundant block for each group, denoted as $c_{1,1}$ for $S_1$ and $c_{1,2}$ for $S_2$. $c_{1,1}$ and $c_{1,2}$ are the **group redundant blocks**. $c_2$ and $c_3$ are **global redundant blocks**. **The Pyramid Codes are not MDS**.
-$$
-c_{1,1}=\sum_{i=1}^{3} \alpha_{i,1} d_i
-$$
-$$
-c_{1,2}=\sum_{i=4}^{6} \alpha_{i,1} d_i
-$$
-
-- **Conclusion**: The BPC uses more storage space than the MDS code, but it gains in recpnstruction read cost and fault tolerance, while maintaining the same update complexity.
-(**higher unrecoverable probability**)
-
-**Recpverability Limit**:
-It is possible to improve fault tolerance without affecting the remaining three metrics (storage overhead, update complexity, access efficiency)
-Two concepts in ERC scheme:
->1. coding dependency: defines which set of data blocks is used to compute each individual redundant block. (a bit vector) (**determines storage overhead, reconstruction read cost, and update complexity**)
->2. coding equations: defines how the set of data blocks compute each redundant block, or the coding coefficients for computing the redundant block.(**only affect fault tolerance**)
-
-- The fault tolerance can be improved by modifying the coding equations. Thus it is possible to improve the fault tolerance without affecting storage overhead, access efficiency, and update complexity.
-> achieved by modifying only coding equations while keeping coding dependency untouched.
-
-**Maximally Recoverable Property**: An erasure-resilient coding scheme is said to hold the maximally recoverable (MR) property under predeterminded coding dependency, if all failure cases satisfying the matching condition are reocverable.
-
-- To investigate the recoverability limit of a given predetermined coding dependency, it establishes a necessary condition, called the **matching condition** to characterize the limit of recoverability for the any failure case.
-> Give a Theorem: a failure case is said to satisfy the matching condition whenever the corresponding **reduced decoding Tanner graph** contains a full-size matching
->
-> 
-
-**Generalized Pyramid Codes**
-GPC holds the maximally recoverable property. Compare with BPC, GPC also accomodates more flexible coding dependency.
-
-
-
-## Strength (Contributions of the paper)
-1. It designs two new ERC schemes: basic pytamid codes (BPC) and generalized pyramid codes (GPC). Both schemes require slightly more storage space than MDS-based ERC, but significantly improve the performance of reconstruction.
-2. It establishes a necessary matching condition to characterize the limit of failure recovery, that is, unless the matching condition is satisfied, a failure case is impossible to recover.
-3. In addition, it also defines a maximally recoverable (MR) property. For all ERC schemes holding the MR property, the matching condition becomes sufficient.
-## Weakness (Limitations of the paper)
-1. For Pyramid code, the improvment of access efficiency comes at the cost of extra storage overhead.
-2. The construction of GPC is very complex, and it needs to take an iterative approach which poses a high construction overhead.
-## Future Works
-1. How to reduce the construction overhead of the GPC?
+---
+typora-copy-images-to: paper_figure
+---
+# Pyramid Codes: Flexible Schemes to Trade Space for Access Efficiency in Reliable Data Storage Systems
+@ToS'13 @Erasure Code
+[TOC]
+
+## Summary
+***Motivation of this paper***: Replication and MDS-based ERC are regarded as two extremes of the tradeoffs between storage space and access efficiency. This paper intends to find the middle ground.
+
+***Method***
+- Four key metrics concerning any ERC scheme:
+ 1) Storage overhead
+ 2) Fault Tolerance (unrecoverable probability)
+ 3) Access Efficiency (the expected number of blocks required to serve an unavailable data block)
+ 4) Update Complexity
+
+
+**Basic Pyramid Codes**:
+
+It divides the six data blocks into two equal-size groups $S_1 = \{d_1, d_2, d_3\}$ and $S_2=\{d_4,d_5,d_6\}$, then computes one redundant block for each group, denoted as $c_{1,1}$ for $S_1$ and $c_{1,2}$ for $S_2$. $c_{1,1}$ and $c_{1,2}$ are the **group redundant blocks**; $c_2$ and $c_3$ are **global redundant blocks**. **The Pyramid Codes are not MDS.** (A toy instance follows the equations below.)
+$$
+c_{1,1}=\sum_{i=1}^{3} \alpha_{i,1} d_i
+$$
+$$
+c_{1,2}=\sum_{i=4}^{6} \alpha_{i,1} d_i
+$$
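+
+A toy instance of the group parities above (for illustration only, taking all $\alpha$ coefficients to be 1 so the sums reduce to XOR):
+
+```python
+# d1..d6 as small bit vectors; group parities are XORs over each group.
+d = [0b1010, 0b0110, 0b1111, 0b0001, 0b1000, 0b0101]
+
+c11 = d[0] ^ d[1] ^ d[2]     # group redundant block for S1 = {d1, d2, d3}
+c12 = d[3] ^ d[4] ^ d[5]     # group redundant block for S2 = {d4, d5, d6}
+
+# Repairing one lost block in S1 reads only the other blocks of that group.
+d2_recovered = c11 ^ d[0] ^ d[2]
+assert d2_recovered == d[1]
+```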
+
+- **Conclusion**: The BPC uses more storage space than the MDS code, but it gains in reconstruction read cost and fault tolerance, while maintaining the same update complexity.
+(**higher unrecoverable probability**)
+
+**Recoverability Limit**:
+It is possible to improve fault tolerance without affecting the remaining three metrics (storage overhead, update complexity, access efficiency)
+Two concepts in ERC scheme:
+>1. coding dependency: defines which set of data blocks is used to compute each individual redundant block (a bit vector). (**determines storage overhead, reconstruction read cost, and update complexity**)
+>2. coding equations: define how the set of data blocks computes each redundant block, i.e., the coding coefficients for computing the redundant block. (**only affects fault tolerance**)
+
+- The fault tolerance can be improved by modifying the coding equations. Thus it is possible to improve the fault tolerance without affecting storage overhead, access efficiency, and update complexity.
+> achieved by modifying only coding equations while keeping coding dependency untouched.
+
+**Maximally Recoverable Property**: An erasure-resilient coding scheme is said to hold the maximally recoverable (MR) property under a predetermined coding dependency, if all failure cases satisfying the matching condition are recoverable.
+
+- To investigate the recoverability limit of a given predetermined coding dependency, it establishes a necessary condition, called the **matching condition**, to characterize the limit of recoverability for any failure case (a small matching check is sketched after the theorem below).
+> Give a Theorem: a failure case is said to satisfy the matching condition whenever the corresponding **reduced decoding Tanner graph** contains a full-size matching
+>
+> 
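+
+A small helper for the matching condition (my own sketch, not the paper's algorithm): model the reduced decoding Tanner graph as a bipartite graph between erased data blocks and surviving redundant blocks, and test for a full-size matching with a standard augmenting-path search. The block names and dependencies below are hypothetical.
+
+```python
+def has_full_matching(erased, parity_deps):
+    """erased: erased data-block ids; parity_deps: {parity_id: set of data blocks it covers}."""
+    match = {}                                   # parity_id -> erased block it is matched to
+
+    def augment(block, seen):
+        for p, deps in parity_deps.items():
+            if block in deps and p not in seen:
+                seen.add(p)
+                if p not in match or augment(match[p], seen):
+                    match[p] = block
+                    return True
+        return False
+
+    return all(augment(b, set()) for b in erased)
+
+# e.g. a BPC-like layout: two group parities plus two global parities over d1..d6
+deps = {"c11": {1, 2, 3}, "c12": {4, 5, 6}, "c2": set(range(1, 7)), "c3": set(range(1, 7))}
+print(has_full_matching([1, 2, 4], deps))                       # True: condition satisfied
+print(has_full_matching([1, 2, 3, 4], {"c11": {1, 2, 3}, "c2": set(range(1, 7))}))  # False
+```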
+
+**Generalized Pyramid Codes**
+GPC holds the maximally recoverable property. Compared with BPC, GPC also accommodates more flexible coding dependencies.
+
+
+
+## Strength (Contributions of the paper)
+1. It designs two new ERC schemes: basic pyramid codes (BPC) and generalized pyramid codes (GPC). Both schemes require slightly more storage space than MDS-based ERC, but significantly improve the performance of reconstruction.
+2. It establishes a necessary matching condition to characterize the limit of failure recovery, that is, unless the matching condition is satisfied, a failure case is impossible to recover.
+3. In addition, it also defines a maximally recoverable (MR) property. For all ERC schemes holding the MR property, the matching condition becomes sufficient.
+## Weakness (Limitations of the paper)
+1. For Pyramid codes, the improvement of access efficiency comes at the cost of extra storage overhead.
+2. The construction of GPC is very complex, and it needs to take an iterative approach which poses a high construction overhead.
+## Future Works
+1. How to reduce the construction overhead of the GPC?
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/XORing-PVLDB'13.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/XORing-PVLDB'13.md
old mode 100644
new mode 100755
index cb79b68..a3a1d8e
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/XORing-PVLDB'13.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/XORing-PVLDB'13.md
@@ -1,36 +1,36 @@
----
-typora-copy-images-to: paper_figure
----
-# XORing Elephants: Novel Erasure Codes for Big Data
-@PVLDB'13 @New Erasure Code @Repair problem
-[TOC]
-
-## Summary
-***Motivation of this paper***: This paper wants to overcome the high repair cost of RS code. It presents a novel family of erasure codes that are efficiently repairable and offer higher reliability compared to Reed-Solomon codes.
-
-***Locally Repairable Codes (LRC)***:
-- The main idea of this code is to sacrifice storage efficiency to gain well performance. And it is inspired by "MDS code with parameters $(k, n-k)$ cannot have locality smaller than $k$". Because it is exactly the cost of **optimal fault tolerance**. Thus, it tries to construct a **near-MDS** code with non-trivial locality.
-- LRCs are constructed on the top of MDS codes: MDS encoded blocks are grouped in logarithmic sized sets and then are combined together to obtain parity blocks or **logarithmic** degree. For the distance:
-$$
-\lim\limits_{k \rightarrow \infty} \frac{d_{LRC}}{d_{MDS}}=1
-$$
-It makes repair efficient by adding additional **local parities**.
-
-
-***Implementation and Evaluation***:
-Its system is a modification of **HDFS-RAID** that incorporates LRC. The **RaidNode** and **BlockFixer** classes were also subject to modifications.
-**Evaluation**: both in Amazon's Elastic Compute Cloud (EC2) and a test cluster in Facebook
->1. HDFS Bytes Read: total amount of data read
->2. Network Traffic: total amount of data communicated from nodes (Amazon's AWS Cloudwatch monitoring tools)
->3. Repair Duration: the time interval between the starting time of the first repair job and the ending time of the last repair job.
-
-## Strength (Contributions of the paper)
-1. introduce a now family of erasure code called **Locally Repairable Codes** (LRCs), which are efficiently repairable both in terms of network bandwidth and disk I/O.
-2. present both randomized and explicit LRC constructions starting from generalized Reed-Solomon parities.
-3. design and implement a module that replaces Reed-Solomon codes with LRCs in HDFS-RAID.
-4. The part of **impartance of repair** is well-written
-## Weakness (Limitations of the paper)
-1. The disadvantage of adding these local parities is the extra storage requirement
-## Future Work
-1. For the part of Reliability Analysis, this paper cannot solve the issue fo measuring the availability tradeoffs of coded storage systems. Thus, this can be a future research direction.
-
+---
+typora-copy-images-to: paper_figure
+---
+# XORing Elephants: Novel Erasure Codes for Big Data
+@PVLDB'13 @New Erasure Code @Repair problem
+[TOC]
+
+## Summary
+***Motivation of this paper***: This paper wants to overcome the high repair cost of RS code. It presents a novel family of erasure codes that are efficiently repairable and offer higher reliability compared to Reed-Solomon codes.
+
+***Locally Repairable Codes (LRC)***:
+- The main idea of this code is to sacrifice some storage efficiency to gain repair performance. It is inspired by the fact that "an MDS code with parameters $(k, n-k)$ cannot have locality smaller than $k$", because that is exactly the cost of **optimal fault tolerance**. Thus, it tries to construct a **near-MDS** code with non-trivial locality.
+- LRCs are constructed on top of MDS codes: MDS-encoded blocks are grouped in logarithmic-sized sets and then combined together to obtain parity blocks of **logarithmic** degree. For the distance:
+$$
+\lim\limits_{k \rightarrow \infty} \frac{d_{LRC}}{d_{MDS}}=1
+$$
+It makes repair efficient by adding additional **local parities**.
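+
+A back-of-the-envelope comparison of single-block repair reads (my own numbers, assuming local groups of five data blocks with one local parity each, on top of a $(10, 4)$ Reed-Solomon code):
+
+```python
+k, group = 10, 5
+rs_repair_reads = k          # RS: read k blocks to decode the lost one
+lrc_repair_reads = group     # LRC: read the 4 surviving group blocks + the local parity
+print(rs_repair_reads, "->", lrc_repair_reads, "blocks read per single-block repair")
+```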
+
+
+***Implementation and Evaluation***:
+Its system is a modification of **HDFS-RAID** that incorporates LRC. The **RaidNode** and **BlockFixer** classes were also subject to modifications.
+**Evaluation**: both in Amazon's Elastic Compute Cloud (EC2) and a test cluster in Facebook
+>1. HDFS Bytes Read: total amount of data read
+>2. Network Traffic: total amount of data communicated from nodes (Amazon's AWS Cloudwatch monitoring tools)
+>3. Repair Duration: the time interval between the starting time of the first repair job and the ending time of the last repair job.
+
+## Strength (Contributions of the paper)
+1. introduce a new family of erasure codes called **Locally Repairable Codes** (LRCs), which are efficiently repairable both in terms of network bandwidth and disk I/O.
+2. present both randomized and explicit LRC constructions starting from generalized Reed-Solomon parities.
+3. design and implement a module that replaces Reed-Solomon codes with LRCs in HDFS-RAID.
+4. The part on the **importance of repair** is well-written.
+## Weakness (Limitations of the paper)
+1. The disadvantage of adding these local parities is the extra storage requirement
+## Future Work
+1. For the part of Reliability Analysis, this paper cannot solve the issue of measuring the availability tradeoffs of coded storage systems. Thus, this can be a future research direction.
+
diff --git a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/product-matrix-MSR-FAST'15.md b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/product-matrix-MSR-FAST'15.md
old mode 100644
new mode 100755
index 5ccb5c5..4871582
--- a/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/product-matrix-MSR-FAST'15.md
+++ b/StoragePaperNote/ErasureCoding/Erasure Coding Deployment/product-matrix-MSR-FAST'15.md
@@ -1,48 +1,48 @@
----
-typora-copy-images-to: paper_figure
----
-# Having Your Cake and Eating It Too: Jointly Optimal Erasure Codes for I/O, Storage and Network-bandwidth
-@FAST'15 @MSR
-[TOC]
-
-## Summary
-***Motivation of this paper***: MSR codes are optimal with respect to storage and network transfers. However, MSR codes do not optimize with respect to I/Os. In general, its I/O overhead is higher than that in a system employing RS code. And with the increasing speeds of newer generation network interconnects, I/O is becoming the **primary bottleneck** in the performance of the storage system. This paper wants to design erasure codes that are simultaneously optimal in terms of I/O, storage, and network bandwidth during reconstructions.
-
-***Optimizing I/O during reconstruction***: In this paper, it proposes two algorithms to **transform MSR codes into codes that are I/O efficient as well**.
-- Algorithm 1: transforms to minimizes I/O cost **locally** at each helper block
-- Algorithm 2: builds on top of Algorithm 1 to minimize I/O cost **globally** across all blocks
-For Algorithm 1, it wants to let the I/O consumed equal to Network Bandwidth consumed by using **Reconstruct-by-transfer (RBT)**.
-
-The rational behind this algorithm are two properties in MSR code:
->1. Independence between helpers: Function computed at a helper is not dependent on which other blocks are helping.
->> Thus, a block computes **pre-determined functions** to aid in reconstruction of each of the other blocks.
->2. Independence between functions computed at helper block
->> 
->> Under MSR, a block does minimum I/O when helping in RBT-fashion
-
-For Algorithm 2, it focus on choosing RBT-helper assignment and minimize I/O cost globally across all blocks. And Algorithm 1 takes the assignment of "who acts as RBT helper to whom" as input. In its model, it uses two parameters $\delta$ and $p$
-> $\delta$: the relative importance between systematic and parity blocks.
-> $p$: $0 \leq p \leq 1$, aim to capture the fact that: when the reconstruction of a block is to be performed, every other block may individually be unavailable with **a probability $p$ independent of all other blocks.**
-
-The expected reconstruction cost for any block, under a given number of RBT-helper blocks, can be easily computed using parameter $p$. And then it can compute number of RBT-helpers for each block and select the RBT-helper for each block. Two extreme cases:(***Maybe this part can be improved***)
-> Complete preferential treatment for data blocks (**Systematic** pattern): Each block RBT-helps data blocks
-> Equality for all: no preferential treatment (**Cyclic** pattern): Each block RBT-helps following blocks
-
-***Implementation and Evaluation***: It uses **Jerasure** and **GF-Complete** libraries for finite-field arithmetic operations. In its evaluations, it mainly focus on:
->1. Data transfers across the Network
->2. Data read and number of I/O
->3. I/O completion time during reconstruction.
->4. Decoding and encoding performance
->5. RBT-helper assignment algorithm
-
-## Strength (Contributions of the paper)
-1. This paper tries to solve the I/O problem in MSR, so that it is jointly optimal erasure codes for I/O, Storage, and Network-bandwidth. It proposes algorithm to transform MSR codes
-2. Implemented and evaluated application onto Product-Matrix MSR codes
-3. Analytical results on optimality.
-
-## Weakness (Limitations of the paper)
-1. This paper just uses the Product-Matrix MSR codes to do the transformation. How about other MSR codes? So it also needs to prove the generality.
-2. For its global optimization model, I think it is a little simple because it just includes two parameters.
-
-## Future Work
+---
+typora-copy-images-to: paper_figure
+---
+# Having Your Cake and Eating It Too: Jointly Optimal Erasure Codes for I/O, Storage and Network-bandwidth
+@FAST'15 @MSR
+[TOC]
+
+## Summary
+***Motivation of this paper***: MSR codes are optimal with respect to storage and network transfers. However, MSR codes do not optimize with respect to I/Os. In general, its I/O overhead is higher than that in a system employing RS code. And with the increasing speeds of newer generation network interconnects, I/O is becoming the **primary bottleneck** in the performance of the storage system. This paper wants to design erasure codes that are simultaneously optimal in terms of I/O, storage, and network bandwidth during reconstructions.
+
+***Optimizing I/O during reconstruction***: In this paper, it proposes two algorithms to **transform MSR codes into codes that are I/O efficient as well**.
+- Algorithm 1: transforms the code to minimize I/O cost **locally** at each helper block
+- Algorithm 2: builds on top of Algorithm 1 to minimize I/O cost **globally** across all blocks
+
+For Algorithm 1, it wants to make the I/O consumed equal to the network bandwidth consumed by using **Reconstruct-by-transfer (RBT)**.
+
+The rationale behind this algorithm is based on two properties of MSR codes:
+>1. Independence between helpers: Function computed at a helper is not dependent on which other blocks are helping.
+>> Thus, a block computes **pre-determined functions** to aid in reconstruction of each of the other blocks.
+>2. Independence between functions computed at a helper block
+>> 
+>> Under MSR, a block does minimum I/O when helping in RBT-fashion
+
+For Algorithm 2, it focuses on choosing the RBT-helper assignment to minimize I/O cost globally across all blocks; Algorithm 1 then takes the assignment of "who acts as RBT-helper to whom" as input. In its model, it uses two parameters, $\delta$ and $p$:
+> $\delta$: the relative importance between systematic and parity blocks.
+> $p$: $0 \leq p \leq 1$, which aims to capture the fact that when the reconstruction of a block is to be performed, every other block may individually be unavailable with **a probability $p$, independent of all other blocks.**
+
+The expected reconstruction cost for any block, under a given number of RBT-helper blocks, can be easily computed using the parameter $p$. It can then compute the number of RBT-helpers for each block and select the RBT-helpers for each block (a toy version of this expectation is sketched after the list below). Two extreme cases: (***Maybe this part can be improved***)
+> Complete preferential treatment for data blocks (**Systematic** pattern): Each block RBT-helps data blocks
+> Equality for all: no preferential treatment (**Cyclic** pattern): Each block RBT-helps following blocks
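+
+A toy version of that expectation (my own simplified model, not the paper's exact expression): assume a repair needs $d$ helpers, each of the $h$ RBT-helpers is independently available with probability $1-p$, available RBT-helpers are used first at read cost 1 each, and remaining helpers are ordinary ones at a higher read cost.
+
+```python
+from math import comb
+
+def expected_repair_reads(d, h, p, c_plain):
+    """Expected read cost when h RBT-helpers exist, each unavailable w.p. p."""
+    exp = 0.0
+    for a in range(h + 1):                                   # a available RBT-helpers
+        prob = comb(h, a) * (1 - p) ** a * p ** (h - a)
+        used_rbt = min(a, d)
+        exp += prob * (used_rbt * 1 + (d - used_rbt) * c_plain)
+    return exp
+
+d, k = 10, 6
+for h in (0, 5, 10):
+    print(h, "RBT-helpers ->", round(expected_repair_reads(d, h, p=0.05, c_plain=d - k + 1), 2))
+```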
+
+***Implementation and Evaluation***: It uses the **Jerasure** and **GF-Complete** libraries for finite-field arithmetic operations. In its evaluation, it mainly focuses on:
+>1. Data transfers across the Network
+>2. Data read and number of I/O
+>3. I/O completion time during reconstruction.
+>4. Decoding and encoding performance
+>5. RBT-helper assignment algorithm
+
+## Strength (Contributions of the paper)
+1. This paper tries to solve the I/O problem of MSR codes, making them jointly optimal erasure codes for I/O, storage, and network bandwidth. It proposes algorithms to transform MSR codes.
+2. It implements and evaluates the approach on Product-Matrix MSR codes.
+3. Analytical results on optimality.
+
+## Weakness (Limitations of the paper)
+1. This paper just uses the Product-Matrix MSR codes to do the transformation. How about other MSR codes? So it also needs to prove generality.
+2. For its global optimization model, I think it is a little simple because it just includes two parameters.
+
+## Future Work
1. I think the global optimization model is a good point for extension and the helper assignment is a common problem in the EC storage system.
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/JunLi's paper/Beehive-HotStorage'15.md b/StoragePaperNote/ErasureCoding/JunLi's paper/Beehive-HotStorage'15.md
old mode 100644
new mode 100755
index ff5dcba..1137c28
--- a/StoragePaperNote/ErasureCoding/JunLi's paper/Beehive-HotStorage'15.md
+++ b/StoragePaperNote/ErasureCoding/JunLi's paper/Beehive-HotStorage'15.md
@@ -1,38 +1,38 @@
----
-typora-copy-images-to: paper_figure
----
-# Beehive: Erasure Codes for Fixing Multiple Failures in Distributed Storage Systems
-@HotStorage'15 @Multiple Failure @MSR code
-[TOC]
-
-## Summary
-***Motivation of this paper***: Generally, distributed storage systems will reconstruct multiple failure blocks separately. However, data unavailability events can be **correlated**, specifically, many disks fail at similar ages. To leveage the advantage of the correlated failures, this paper wants to reconstruct multiple missing blocks in batches, to save both network transfer and disk I/O during reconstruction.
-
-***Beehive***:
-- An instant benefit of Beehive is each block will only be **read once** to reconstruct multiple blocks.
-
-- The construction of Beehive codes is built on top of product-matrix MSR codes, based on one particular *product-matrix* construction proposed by Rashmi.
-> 1. MSR codes constructed are systematic
-> 2. Unlike other constructions that impose constraints on specific values of $d$ or $k$, the construction proposed in product-matrix is much more general by only requiring $d \geq 2k-2$
-
-**Encode**:
-It divides one generation of the original data into two parts that contains $k$ and $k-1$ blocks. In first part each block contains $d-k+1$ segments. In second par each block contains $t-1$ segments.
-From the original data, Beehive computes $n$ blocks. each block contains the $d-k+1$ segments from $g_iF$ and the $t-1$ segments from $\sum_{l=1}^{k-1}a_{i,l}c_l$
-
-
-**Decode**:
-- The helpers computes the decoding information and sends it to newcomer $j$.
-- At the side of newcomers, it can divide their operation into two stages. In the first stage, it will receive $d$ segments from helpers and do the calculation. In the second stage, it will send the calculation result to another newcomer $j^{'}$, and it will receive $t-1$ segments from other newcomers as well.
-- During reconstruction, each newcomer will receive $d+t-1$ segments, achieving the optimal network transfer to reconstruct $t$ blocks.
-
-
-***Implementation and Evaluation***:
-Implement Beehive in C++ by using the Intel storage acceleration library (ISA-L) for the finite field arithmetic.
-## Strength (Contributions of the paper)
-1. this paper propose a new family of erasure codes that can reconstruct multiple blocks at same time. And each block will only be read once to reconstrcut multiple blocks.
-## Weakness (Limitations of the paper)
-1. Need additional storage overhead
-2. Its encoding process is based on MSR code, so Beehive's encoding operation is bit slower than MSR codes
-## Future Works
-1. A very improtant issue I consider is how to combine Beehive with practical distributed storage system.
-
+---
+typora-copy-images-to: paper_figure
+---
+# Beehive: Erasure Codes for Fixing Multiple Failures in Distributed Storage Systems
+@HotStorage'15 @Multiple Failure @MSR code
+[TOC]
+
+## Summary
+***Motivation of this paper***: Generally, distributed storage systems reconstruct multiple failed blocks separately. However, data unavailability events can be **correlated**; specifically, many disks fail at similar ages. To leverage correlated failures, this paper wants to reconstruct multiple missing blocks in batches, to save both network transfer and disk I/O during reconstruction.
+
+***Beehive***:
+- An instant benefit of Beehive is that each surviving block will only be **read once** to reconstruct multiple blocks.
+
+- The construction of Beehive codes is built on top of product-matrix MSR codes, based on one particular *product-matrix* construction proposed by Rashmi.
+> 1. MSR codes constructed are systematic
+> 2. Unlike other constructions that impose constraints on specific values of $d$ or $k$, the construction proposed in product-matrix is much more general by only requiring $d \geq 2k-2$
+
+**Encode**:
+It divides one generation of the original data into two parts that contain $k$ and $k-1$ blocks, respectively. In the first part, each block contains $d-k+1$ segments; in the second part, each block contains $t-1$ segments.
+From the original data, Beehive computes $n$ blocks; each block contains the $d-k+1$ segments from $g_iF$ and the $t-1$ segments from $\sum_{l=1}^{k-1}a_{i,l}c_l$.
+
+
+**Decode**:
+- The helpers compute the decoding information and send it to newcomer $j$.
+- On the newcomer side, the operation can be divided into two stages. In the first stage, a newcomer receives $d$ segments from the helpers and performs its computation. In the second stage, it sends the computed result to another newcomer $j^{'}$, and it receives $t-1$ segments from the other newcomers as well.
+- During reconstruction, each newcomer receives $d+t-1$ segments, achieving the optimal network transfer to reconstruct $t$ blocks.
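+
+A quick count of helper-block disk reads (my own back-of-the-envelope numbers, reflecting the "read once" benefit noted above rather than the paper's full analysis):
+
+```python
+# Repairing t failed blocks one by one re-reads the d helpers for every repair;
+# the batched scheme reads each helper block only once and lets newcomers exchange data.
+def separate_reads(d, t):
+    return t * d
+
+def batched_reads(d):
+    return d
+
+d, t = 10, 3
+print("separate repairs read", separate_reads(d, t), "blocks; batched repair reads", batched_reads(d))
+```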
+
+
+***Implementation and Evaluation***:
+Implement Beehive in C++ by using the Intel storage acceleration library (ISA-L) for the finite field arithmetic.
+## Strength (Contributions of the paper)
+1. This paper proposes a new family of erasure codes that can reconstruct multiple blocks at the same time, and each surviving block will only be read once to reconstruct multiple blocks.
+## Weakness (Limitations of the paper)
+1. It needs additional storage overhead.
+2. Its encoding process is based on MSR codes, so Beehive's encoding operation is a bit slower than that of MSR codes.
+## Future Works
+1. A very important issue I consider is how to combine Beehive with a practical distributed storage system.
+
diff --git a/StoragePaperNote/ErasureCoding/JunLi's paper/Carousel-ICDCS'17.md b/StoragePaperNote/ErasureCoding/JunLi's paper/Carousel-ICDCS'17.md
old mode 100644
new mode 100755
index 15068c4..b7875cd
--- a/StoragePaperNote/ErasureCoding/JunLi's paper/Carousel-ICDCS'17.md
+++ b/StoragePaperNote/ErasureCoding/JunLi's paper/Carousel-ICDCS'17.md
@@ -1,38 +1,38 @@
----
-typora-copy-images-to: paper_figure
----
-# On Data Parallelism of Erasure Coding in Distributed Storage Systems
-@ICDCS'17 @Data Parallelism
-[TOC]
-
-## Summary
-***Motivation of this paper***: Data paralleism, which refers to the number of blocks that can be read by different processes simultaneously, is limited by existing **systematic** erasure code. This paper is designed to extend data parallelism from reading $k$ data blocks in parallel to reading all $n$ blocks. Therefore, it can have a higher overall throughput.
-
-
-***Carousel Codes***:
-- Carousel codes achieve a flexible trade-off between **data parallelism** and **data availability**.
-The process of the construction is shown below:
-
-- How to achieve flexible parallelism: This paper also allows users to flexibly specify the degree of data parallelism by controlling the number of blocks that contains the original data.
-
-
-***Implementation and Evaluation***:
-This paper implements the Carousel Code in C++. All operations, including encoding, decoding, and reconstruction are implemented by using ISA-L.
-- The advantage of the **sparsity** of the generating matrix in Carousel codes
-> Even though the size of the generating matrix is expaned the complexity of encoding and output bit does not change.
-
-**Evaluation**
->1. The comparison of the encoding and decoding throughput for various values of $k$.
->2. Completion time of reconstruction operations for various values of $k$.
->3. Comparison of Hadoop jobs running on data encoded with systematic RS codes and Carousel codes. (terasort and wordcount in Hadoop)
->4. Comparsion of the time of retrieving a 3GB file from HDFS with systematic RS code and Carousel codes
-
-
-## Strength (Contributions of the paper)
-1. This paper presents **Carousel Codes**, which has a higher data parallelism $\rightarrow$ a higher overall throughput.
-2. It also shows the besides the MDS property and the configurable data parallelism, Carousel codes can achieve such optimal network transfer during reconstruction as well.
-3. It also implemented Carousel codes in C++ and developed its prototype in Apache Hadoop.
-## Weakness (Limitations of the paper)
-1. I think a important issue of this code is the overhead of reconstruction, including the data re-distribution and re-encoding, which is much higher than the systematic RS code.
-## Future Works
-1. For the first weakness, I consider how to decrease this kind of overhead is a potential research direction.
+---
+typora-copy-images-to: paper_figure
+---
+# On Data Parallelism of Erasure Coding in Distributed Storage Systems
+@ICDCS'17 @Data Parallelism
+[TOC]
+
+## Summary
+***Motivation of this paper***: Data parallelism, which refers to the number of blocks that can be read by different processes simultaneously, is limited by existing **systematic** erasure codes. This paper is designed to extend data parallelism from reading $k$ data blocks in parallel to reading all $n$ blocks. Therefore, it can achieve a higher overall throughput.
+
+
+***Carousel Codes***:
+- Carousel codes achieve a flexible trade-off between **data parallelism** and **data availability**.
+The process of the construction is shown below:
+
+- How to achieve flexible parallelism: This paper also allows users to flexibly specify the degree of data parallelism by controlling the number of blocks that contain the original data.
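+
+A minimal illustration of why more readable blocks raise throughput (my own toy model, not the paper's construction): the read time of a file scales inversely with how many blocks can serve original data in parallel, which Carousel codes extend from the $k$ data blocks of a systematic code toward all $n$ blocks.
+
+```python
+def parallel_read_time(total_mb, per_block_mb_per_s, readable_blocks):
+    return total_mb / (per_block_mb_per_s * readable_blocks)
+
+total, thr, n, k = 3000, 100, 9, 6     # hypothetical 3 GB file, 100 MB/s per block, (n, k) = (9, 6)
+print("systematic:", parallel_read_time(total, thr, k), "s; Carousel-style:", parallel_read_time(total, thr, n), "s")
+```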
+
+
+***Implementation and Evaluation***:
+This paper implements the Carousel Code in C++. All operations, including encoding, decoding, and reconstruction are implemented by using ISA-L.
+- The advantage of the **sparsity** of the generating matrix in Carousel codes
+> Even though the size of the generating matrix is expanded, the complexity of encoding each output bit does not change.
+
+**Evaluation**
+>1. The comparison of the encoding and decoding throughput for various values of $k$.
+>2. Completion time of reconstruction operations for various values of $k$.
+>3. Comparison of Hadoop jobs running on data encoded with systematic RS codes and Carousel codes. (terasort and wordcount in Hadoop)
+>4. Comparsion of the time of retrieving a 3GB file from HDFS with systematic RS code and Carousel codes
+
+
+## Strength (Contributions of the paper)
+1. This paper presents **Carousel Codes**, which has a higher data parallelism $\rightarrow$ a higher overall throughput.
+2. It also shows that, besides the MDS property and the configurable data parallelism, Carousel codes can achieve optimal network transfer during reconstruction as well.
+3. It also implemented Carousel codes in C++ and developed its prototype in Apache Hadoop.
+## Weakness (Limitations of the paper)
+1. I think an important issue of this code is the overhead of reconstruction, including data re-distribution and re-encoding, which is much higher than that of systematic RS codes.
+## Future Works
+1. For the first weakness, I consider decreasing this kind of overhead to be a potential research direction.
diff --git a/StoragePaperNote/ErasureCoding/JunLi's paper/JunLi-INFOCOM'14.md b/StoragePaperNote/ErasureCoding/JunLi's paper/JunLi-INFOCOM'14.md
old mode 100644
new mode 100755
index cbf117e..b3d427e
--- a/StoragePaperNote/ErasureCoding/JunLi's paper/JunLi-INFOCOM'14.md
+++ b/StoragePaperNote/ErasureCoding/JunLi's paper/JunLi-INFOCOM'14.md
@@ -1,24 +1,24 @@
----
-typora-copy-images-to: paper_figure
----
-# Cooperative Pipelined Regeneration in Distributed Storage Systems
-@INFOCOM'14 @Cooperative Regeneration
-[TOC]
-
-## Summary
-***Motivation of this paper***: In regenerating codes, it needs to engage a large number of nodes during regeneration, which is not desirable in practical system. Although, some techniques have been proposed to reduce the number of participating nodes during regeneration, they either fail to maintain the recoverability property, or are not designed for regenerating **multiple data losses**. Thus, this paper proposes a cooperative pipelined regeneration process to regenerate multiple data looses in batches.
-
-***Cooperative Pipelined Regeneration***:
-1. Newcomers recevive data from other participating nodes and encode received data into partially regenerated coded blocks.
-2. After this round of regeneration, newcomers will be partially regenerated and become **apprentices**.
-3. In the next round of cooperative pipelined regeneration, apprentices and $r$ new newcomers receive data from other participating nodes with another set of $v$ providers.
-
-
-## Strength (Contributions of the paper)
-1. This paper illustrates the pipelined regeneration with both **Random Linear Codes** and **Regenerating Codes**.
-## Weakness (Limitations of the paper)
-1. The apprentices introbduce additional storage overhead in this code
-2. This paper just gives the theroical analysis of this code, not includes the implementation
-3. It needs to do the experiments in a practical distribution storage system.
-## Future Works
-1. This work is really theorical, I think it should discuss more on how to implement it in a practical distribution system.
+---
+typora-copy-images-to: paper_figure
+---
+# Cooperative Pipelined Regeneration in Distributed Storage Systems
+@INFOCOM'14 @Cooperative Regeneration
+[TOC]
+
+## Summary
+***Motivation of this paper***: In regenerating codes, a large number of nodes need to be engaged during regeneration, which is not desirable in practical systems. Although some techniques have been proposed to reduce the number of participating nodes during regeneration, they either fail to maintain the recoverability property or are not designed for regenerating **multiple data losses**. Thus, this paper proposes a cooperative pipelined regeneration process to regenerate multiple data losses in batches.
+
+***Cooperative Pipelined Regeneration***:
+1. Newcomers receive data from other participating nodes and encode the received data into partially regenerated coded blocks.
+2. After this round of regeneration, newcomers will be partially regenerated and become **apprentices**.
+3. In the next round of cooperative pipelined regeneration, apprentices and $r$ new newcomers receive data from other participating nodes with another set of $v$ providers.
+
+
+## Strength (Contributions of the paper)
+1. This paper illustrates the pipelined regeneration with both **Random Linear Codes** and **Regenerating Codes**.
+## Weakness (Limitations of the paper)
+1. The apprentices introduce additional storage overhead in this code.
+2. This paper only gives a theoretical analysis of this code and does not include an implementation.
+3. It needs to do experiments in a practical distributed storage system.
+## Future Works
+1. This work is largely theoretical; it should discuss more about how to implement the scheme in a practical distributed storage system.
diff --git a/StoragePaperNote/ErasureCoding/JunLi's paper/Pipelined_Regeneration-NetCod'11.md b/StoragePaperNote/ErasureCoding/JunLi's paper/Pipelined_Regeneration-NetCod'11.md
old mode 100644
new mode 100755
index 8170953..e4530da
--- a/StoragePaperNote/ErasureCoding/JunLi's paper/Pipelined_Regeneration-NetCod'11.md
+++ b/StoragePaperNote/ErasureCoding/JunLi's paper/Pipelined_Regeneration-NetCod'11.md
@@ -1,24 +1,24 @@
----
-typora-copy-images-to: paper_figure
----
-# Pipelined Regeneration with Regenerating Codes for Distributed Storage Systems
-@NetCod'11 @Regenerating Code
-[TOC]
-
-## Summary
-***Motivation of this paper***: In regenerating codes, minimum-storage regenerating codes require the newcomer to receive $\frac{M}{k} \times \frac{d}{d-k+1}$. However, in practical distributed storage system, it is not favorable to let a large number of nodes work cooperatively. This paper proposes a pipelined regeneration process that can reduce the number of participating nodes required by regenerating codes **without sacrificing data integrity**.
-
-***Pipelined Regeneration***:
-The *independent regeneration* and *cooperative regeneration*
-
-In some practical distributed storage system, the regeneration process will not be triggered until the number of failed storage node has reached a **certain threshold** and they can be regenerated in batches. (many newcomers)
-
-This paper engages much fewer participating nodes by dividing one round of cooperative regeneration into several rounds of pipelined regeneration. In each round of pipelined regeneration, both the newcomer and apprentices are partially regenerated, such that there are fewer than $d$ providers. Suppose there are one newcomer, $\alpha$ apprentices, $v$ providers, during pipelined regeneration, all providers send $\alpha+1$ ($\alpha$ apprentices and one newcomer). A newcomer will be fully regenerated after $\alpha+1$ rounds of pipelined regeneration.
-
-## Strength (Contributions of the paper)
-1. This paper proposes the pipelined regeneration process, which can consume a smaller amount of traffic that would have been achieved by the conventional regeneration process with a much higher number of participating nodes.
-## Weakness (Limitations of the paper)
-1. It just consider the case of single failure and fails to utilize multiple newcomers cooperatively, and can only support MSR codes.
-2. I feel the implementation of this code maybe very hard. So how to combine it with a practical distributed storage system is a issue.
-## Future Works
-1. How to utilize multiple newcomers in cooperatice fashion, such that it can further saving the bandwidth consumption
+---
+typora-copy-images-to: paper_figure
+---
+# Pipelined Regeneration with Regenerating Codes for Distributed Storage Systems
+@NetCod'11 @Regenerating Code
+[TOC]
+
+## Summary
+***Motivation of this paper***: In regenerating codes, minimum-storage regenerating codes require the newcomer to receive $\frac{M}{k} \times \frac{d}{d-k+1}$ units of data from $d$ providers. However, in a practical distributed storage system, it is not favorable to let a large number of nodes work cooperatively. This paper proposes a pipelined regeneration process that can reduce the number of participating nodes required by regenerating codes **without sacrificing data integrity**.
+
+***Pipelined Regeneration***:
+The paper contrasts *independent regeneration* with *cooperative regeneration*.
+
+In some practical distributed storage systems, the regeneration process is not triggered until the number of failed storage nodes reaches a **certain threshold**, so the failed nodes can be regenerated in batches (i.e., with many newcomers).
+
+This paper engages much fewer participating nodes by dividing one round of cooperative regeneration into several rounds of pipelined regeneration. In each round of pipelined regeneration, both the newcomer and the apprentices are partially regenerated, such that there are fewer than $d$ providers. Suppose there are one newcomer, $\alpha$ apprentices, and $v$ providers; during pipelined regeneration, all providers send data to the $\alpha+1$ partially regenerated nodes ($\alpha$ apprentices and one newcomer). A newcomer is fully regenerated after $\alpha+1$ rounds of pipelined regeneration.
+
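+A quick arithmetic check of the repair bandwidth quoted above, with hypothetical example parameters (the numbers are illustrative, not from the paper):
+
+```python
+# MSR repair of one node downloads (M / k) * d / (d - k + 1) units of data
+# from d providers, versus downloading the whole file (M) under naive RS repair.
+M, n, k = 64, 10, 6          # hypothetical file size (MB) and code parameters
+d = n - 1                    # number of providers contacted for repair
+msr_repair = (M / k) * d / (d - k + 1)
+naive_repair = M             # conventional RS repair reads k blocks = M
+print(msr_repair, naive_repair)   # 24.0 MB vs 64 MB for this example
+```
+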
+## Strength (Contributions of the paper)
+1. This paper proposes the pipelined regeneration process, which can achieve a smaller amount of repair traffic than the conventional regeneration process, which engages a much higher number of participating nodes.
+## Weakness (Limitations of the paper)
+1. It only considers the case of a single failure, fails to utilize multiple newcomers cooperatively, and can only support MSR codes.
+2. I feel the implementation of this scheme may be very hard, so how to combine it with a practical distributed storage system is an open issue.
+## Future Works
+1. How to utilize multiple newcomers in a cooperative fashion, so as to further reduce the bandwidth consumption.
diff --git a/StoragePaperNote/ErasureCoding/JunLi's paper/Zebra-IWQoS'16.md b/StoragePaperNote/ErasureCoding/JunLi's paper/Zebra-IWQoS'16.md
old mode 100644
new mode 100755
index 564caa9..3005d16
--- a/StoragePaperNote/ErasureCoding/JunLi's paper/Zebra-IWQoS'16.md
+++ b/StoragePaperNote/ErasureCoding/JunLi's paper/Zebra-IWQoS'16.md
@@ -1,42 +1,42 @@
----
-typora-copy-images-to: paper_figure
----
-# Zebra: Demand-aware Erasure Coding for Distributed Storage Systems
-@IWQoS'16 @Heterogeneous Erasure Codes @How to use the skewness of demand in distributed
-
-[TOC]
-
-## Summary
-***Motivation of this paper***: For the erasure code $(k, r)$, the value of $k$ can affect both CPU overhead and Network overhead. In other word, a smaller value of $k$ means less reconstruction overhead, leading to less network transfer for data reconstruction as well as lower latency for degraded reads. $k$ can do the tradeoff between storage overhead and reconstruction overhead. (Normally, a very small portion of data are highly demanded while the rest are barely touched). This paper wants to achieve that **hot data can be reconstructed with low overhead and cold data can also enjoy low storage overhad at same time**.
-
-***Zebra Framework***:
-- Goal: dynamically assign data into multiple tiers by their demand so as to achieve a **flexible tradeoff** between storage and reconstruction overhead.
-- In its model, the definition of the overall reconstruction overhead is $\sum_{i=1}^{N}d_ik_i=D \times K$ (k is the parameter in $RS(k,r)$, $d_i$ is the demand of visiting per unit of time). Thus, it intends to solve $K$ to minimize the overall reconstruction overhead with respect to the storage overhead. (**integer geometric programming problem**)
-- To reduce the complexity of this optimization problem, it assumes all blocks of the same file should have the **same or similar** demand in a distributed storage system, and files with similar demand will be merged into same one. This can further refine the model.
-- It finally encodes blocks in each tier with a $(k_i, r)$ RS code, where $k_i$ is the rank of the file and $r$ is the number of failures to tolerate.
-
-**Deploying Zebra with Online Demand**:
-- To work well with the demand in the future, the demand in the mode will not be purely determined by the most recent demand, but a **linear combination** of demand over a longer period of time: $D_0$ is the most recent demand, $D$ is the demand in last interval.
-$$
-(1-\alpha)D+\alpha D_0
-$$
-- To reduce the overhead of data migration, it leverages an advantage of Cauchy RS code that **any submatrix of a Cauchy matrix is still a Cauchy matrix**. Because of this property, it can easily downgrade or upgrade data between an $(mk, r)$ and a $(k, r)$, $m \in Z$
-
-
-***Implementation and Evaluation***:
-This paper does not mention the detail of implementation. It just states that it does the simulation in the workload of Facebook.
-**Evaluation:**
->1. the average of the reconstruction overhead per visit
->2. the average reconstruction overhead with various values of $\alpha$ (to describe the tradeoff the consistency and the transiency of the demand)
->3. the ratio of migration traffic to demand traffic
-
-## Strength (Contributions of the paper)
-1. All current systems require users to configure the erasure code used in each tier as well as their parameters **statically**. Zebra can configure itself **flexibly**, where only the overall stroage overhead and failure tolerance need to be manually specified.
-2. This paper also consider how to make this framework more practical by using **online demand**, and solve the issue of how to reduce the data migration when the configuration of Erasure Code changes by using the feature of **Cauchy Matrix**.
-## Weakness (Limitations of the paper)
-1. To reduce the network traffic overhead of data migration, it leverages the feature of Cauchy Matrix, and designs some constraints for the parameters. This issue makes it not very general.
-2. The model of the online demand is very simple, it is just the linear combination of consistency and trnasiency of the demand.
-3. This paper lacks the details of how to implement Zebra framework in a practical system.
-## Future Works
-1. For the first weakness, how to make the parameter selection more flexible is a potential direction.
-
+---
+typora-copy-images-to: paper_figure
+---
+# Zebra: Demand-aware Erasure Coding for Distributed Storage Systems
+@IWQoS'16 @Heterogeneous Erasure Codes @How to use the skewness of demand in distributed storage
+
+[TOC]
+
+## Summary
+***Motivation of this paper***: For an erasure code $(k, r)$, the value of $k$ affects both the CPU overhead and the network overhead. In other words, a smaller value of $k$ means less reconstruction overhead, leading to less network transfer for data reconstruction as well as lower latency for degraded reads; $k$ thus controls the tradeoff between storage overhead and reconstruction overhead. (Normally, a very small portion of the data is highly demanded while the rest is barely touched.) This paper wants to ensure that **hot data can be reconstructed with low overhead while cold data also enjoys low storage overhead at the same time**.
+
+***Zebra Framework***:
+- Goal: dynamically assign data into multiple tiers by their demand so as to achieve a **flexible tradeoff** between storage and reconstruction overhead.
+- In its model, the overall reconstruction overhead is defined as $\sum_{i=1}^{N}d_ik_i=D \times K$ ($k_i$ is the parameter of the $RS(k_i, r)$ code used for file $i$, and $d_i$ is its access demand per unit of time). Thus, it solves for $K$ to minimize the overall reconstruction overhead subject to a storage-overhead constraint (an **integer geometric programming** problem); see the sketch after this list.
+- To reduce the complexity of this optimization problem, it assumes that all blocks of the same file have the **same or similar** demand in a distributed storage system, and files with similar demand are merged into the same tier. This further refines the model.
+- It finally encodes blocks in each tier with a $(k_i, r)$ RS code, where $k_i$ is the rank of the file and $r$ is the number of failures to tolerate.
+
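+A minimal brute-force sketch of this objective (not Zebra's actual solver, which uses geometric programming; all numbers below are hypothetical):
+
+```python
+# Pick one k_i per demand class to minimize the total reconstruction overhead
+# sum(d_i * k_i), subject to an overall storage-overhead budget.
+from itertools import product
+
+demands = [100, 10, 1]          # d_i: accesses per unit time for three classes
+sizes = [1, 4, 20]              # data volume of each class (arbitrary units)
+r = 2                           # number of parity blocks per stripe
+candidate_k = [2, 4, 6, 12]
+budget = 1.5                    # max allowed storage overhead (x raw data size)
+
+best = None
+for ks in product(candidate_k, repeat=len(demands)):
+    storage = sum(s * (k + r) / k for s, k in zip(sizes, ks)) / sum(sizes)
+    if storage > budget:
+        continue
+    recon = sum(d * k for d, k in zip(demands, ks))
+    if best is None or recon < best[0]:
+        best = (recon, ks, storage)
+
+print(best)   # hot classes end up with a small k, the coldest with a larger k
+```
+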
+**Deploying Zebra with Online Demand**:
+- To work well with future demand, the demand used in the model is not purely determined by the most recent demand, but is a **linear combination** of demand over a longer period of time, where $D_0$ is the most recent demand and $D$ is the demand from the last interval (see the sketch after this list):
+$$
+(1-\alpha)D+\alpha D_0
+$$
+- To reduce the overhead of data migration, it leverages a property of the Cauchy RS code: **any submatrix of a Cauchy matrix is still a Cauchy matrix**. Because of this property, it can easily downgrade or upgrade data between an $(mk, r)$ code and a $(k, r)$ code, where $m \in \mathbb{Z}^{+}$.
+
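+A minimal sketch of the smoothing rule above (function name and numbers are illustrative):
+
+```python
+# The demand fed to the model is (1 - alpha) * D + alpha * D0, i.e., an
+# exponentially weighted moving average over measurement intervals.
+def smooth_demand(history, alpha=0.5):
+    """history: per-interval measured demands, oldest first."""
+    d = history[0]
+    for d0 in history[1:]:
+        d = (1 - alpha) * d + alpha * d0   # D0 = the most recent interval's demand
+    return d
+
+print(smooth_demand([100, 80, 10, 5], alpha=0.5))   # a recent drop pulls the estimate down
+```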
+
+***Implementation and Evaluation***:
+This paper does not mention the details of its implementation. It only states that it runs simulations on a Facebook workload.
+**Evaluation:**
+>1. the average of the reconstruction overhead per visit
+>2. the average reconstruction overhead with various values of $\alpha$ (to describe the tradeoff between the consistency and the transiency of the demand)
+>3. the ratio of migration traffic to demand traffic
+
+## Strength (Contributions of the paper)
+1. All current systems require users to configure the erasure code used in each tier, as well as its parameters, **statically**. Zebra can configure itself **flexibly**, where only the overall storage overhead and failure tolerance need to be manually specified.
+2. This paper also considers how to make the framework more practical by using **online demand**, and solves the issue of reducing data migration when the erasure code configuration changes by using the submatrix property of the **Cauchy matrix**.
+## Weakness (Limitations of the paper)
+1. To reduce the network traffic overhead of data migration, it leverages the Cauchy matrix property and imposes some constraints on the parameters, which makes it less general.
+2. The model of the online demand is very simple; it is just a linear combination of the consistency and the transiency of the demand.
+3. This paper lacks the details of how to implement Zebra framework in a practical system.
+## Future Works
+1. For the first weakness, how to make the parameter selection more flexible is a potential direction.
+
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CAR-DSN'16.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CAR-DSN'16.md
old mode 100644
new mode 100755
index edfc420..6fe0b15
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CAR-DSN'16.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CAR-DSN'16.md
@@ -1,52 +1,52 @@
----
-typora-copy-images-to: paper_figure
----
-# Reconsidering Single Failure Recovery in Clustered File Systems
-@DSN'16 @Single Failure
-[TOC]
-
-## Summary
-***Motivation of this paper***:
-1. existing studies on single failure recovery neglect the **bandwidth diversity propert** in CFS archiecture (intra-rack and cross-rack).
-2. Many single failure recovery solutions focus on XOR-based erasure-codes, which are not commonly used for maintaining fault tolerance in a CFS.
-3. Existing single failure recovery solutions focus on minimizing the amount of repair traffic, but most of them do not consider the load balancing issue during the recovery operation.
-*To this end, this paper aims to reduce and balance the amount of cross-rack repair traffic for a single failure tolerance.*
-
-***Cross-rack-aware Recovery (CAR)***
-Three key techniques:
-- 1. CAR examines the data layout and finds a recovery solution in which the resulting reapir traffic comes from the minimum number of racks.
-
-This method is not very complex, just needs to the layout.
-
-- 2. CAR performs intra-rack aggregation for the retrieved chunks in each rack before transmitting them across racks in order to minimize the amount of cross-rack repair traffic.
-After finding the minimum number of intact racks to be accessed for recovery, it also performs intra-rack chunk aggregation for the retrieved chunks in the same rack. (**partial decoding**)
-> Due to the linearity, suppose the first $j$ requested chunks $\{H_1^{'},..., H_j^{'}\}$ are stored in the same rack, so it can specify a node in that rack to perform the linear operations based on the $\sum_{i=1}^{'}y_{i}H_{i}^{'}$
-
-- 3. CAR examines the per-stripe recovery solutions across multiple stripes, and constructs a multi-stripe recovery solution that balances the amount of cross-rack repair traffic across multiple racks.
-
-To describe the load balance rate $\lambda$ of the cross-rack repair traffic, it defines it as follows:
-$$
-\lambda = \frac{max\{t_{i,j}\}}{\sum{\frac{t_{i, j}}{r-1}}}
-$$
-The ratio of the maximum amount of cross-rack repair traffic across each rack to the average amount of cross-rack repair traffic over the **intact** racks.So it formulates this question into an optimization problem.
-> Goal: minimize the load balancing rate, subject to the condition that the total amount of cross-rack repair. (Minimize $\lambda$, subject to $\sum{t_{i,f}}$ is minimized)
-
-The main idea of it is to replace the currently selected multi-stripe recovery solution with another one that introduces a smaller load balancing rate $\lambda$
-
-
-
-***Implementation and Evaluation***:
-**Evaluation**:
-1. Cross-Rack Repair Traffic: evaluate the amount of cross-rack repair traffic when recovering a single lost chunk.
-2. Load Balancing: measure the laod balancing rate (i.e, $\lambda$)
-3. Computation Time and Transmission Time
-## Strength (Contributions of the paper)
-1. this paper identifies the open issues that are not addressed by existing studies on single failure recovery.
-2. It proposes CAR, a new cross-rack-aware single failure recovery algorithm for a CFS setting.
-3. It also implements CAR and conduct extensive testbed experiments based on different CFS settings with up to 20 nodes.
-## Weakness (Limitations of the paper)
-1. Firstly, this paper does not provider the details of how to implement this recovery scheme in to a proactical system, e.g., how to achieve the partial decoding?
-2. The idea of this paper is not vey novel and easy to understand. I think the performance of this scheme highly depends on the layout.
-## Future Works
-1. A very serious issue is how to decrease the overhead of the partial decoding in the internal of the a rack.
+---
+typora-copy-images-to: paper_figure
+---
+# Reconsidering Single Failure Recovery in Clustered File Systems
+@DSN'16 @Single Failure
+[TOC]
+
+## Summary
+***Motivation of this paper***:
+1. Existing studies on single failure recovery neglect the **bandwidth diversity property** in the CFS architecture (intra-rack vs. cross-rack).
+2. Many single failure recovery solutions focus on XOR-based erasure-codes, which are not commonly used for maintaining fault tolerance in a CFS.
+3. Existing single failure recovery solutions focus on minimizing the amount of repair traffic, but most of them do not consider the load balancing issue during the recovery operation.
+*To this end, this paper aims to reduce and balance the amount of cross-rack repair traffic for single failure recovery.*
+
+***Cross-rack-aware Recovery (CAR)***
+Three key techniques:
+- 1. CAR examines the data layout and finds a recovery solution in which the resulting repair traffic comes from the minimum number of racks.
+
+This method is not very complex; it just needs to examine the data layout.
+
+- 2. CAR performs intra-rack aggregation for the retrieved chunks in each rack before transmitting them across racks in order to minimize the amount of cross-rack repair traffic.
+After finding the minimum number of intact racks to be accessed for recovery, it also performs intra-rack chunk aggregation for the retrieved chunks in the same rack. (**partial decoding**)
+> Due to linearity, suppose the first $j$ requested chunks $\{H_1^{'},..., H_j^{'}\}$ are stored in the same rack; CAR can then designate a node in that rack to perform the linear operations and compute the partial sum $\sum_{i=1}^{j}y_{i}H_{i}^{'}$.
+
+- 3. CAR examines the per-stripe recovery solutions across multiple stripes, and constructs a multi-stripe recovery solution that balances the amount of cross-rack repair traffic across multiple racks.
+
+To describe the load balance rate $\lambda$ of the cross-rack repair traffic, it defines it as follows:
+$$
+\lambda = \frac{max\{t_{i,j}\}}{\sum{\frac{t_{i, j}}{r-1}}}
+$$
+That is, the ratio of the maximum amount of cross-rack repair traffic over all racks to the average amount of cross-rack repair traffic over the **intact** racks. It then formulates this question as an optimization problem.
+> Goal: minimize the load balancing rate, subject to the condition that the total amount of cross-rack repair traffic is minimized (i.e., minimize $\lambda$ subject to $\sum{t_{i,f}}$ being minimized).
+
+Its main idea is to iteratively replace the currently selected multi-stripe recovery solution with another one that introduces a smaller load balancing rate $\lambda$.
+
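+A minimal sketch of the load-balancing rate defined above (toy traffic numbers):
+
+```python
+# traffic[j] is the cross-rack repair traffic that intact rack j must send for
+# a recovery solution; lambda is the maximum divided by the average over the
+# r - 1 intact racks.
+def load_balance_rate(traffic):
+    avg = sum(traffic) / len(traffic)     # len(traffic) == r - 1 intact racks
+    return max(traffic) / avg
+
+print(load_balance_rate([4, 4, 4]))   # 1.0: perfectly balanced
+print(load_balance_rate([9, 2, 1]))   # 2.25: one rack carries most of the repair load
+```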
+
+
+***Implementation and Evaluation***:
+**Evaluation**:
+1. Cross-Rack Repair Traffic: evaluate the amount of cross-rack repair traffic when recovering a single lost chunk.
+2. Load Balancing: measure the load balancing rate (i.e., $\lambda$)
+3. Computation Time and Transmission Time
+## Strength (Contributions of the paper)
+1. this paper identifies the open issues that are not addressed by existing studies on single failure recovery.
+2. It proposes CAR, a new cross-rack-aware single failure recovery algorithm for a CFS setting.
+3. It also implements CAR and conducts extensive testbed experiments on different CFS settings with up to 20 nodes.
+## Weakness (Limitations of the paper)
+1. Firstly, this paper does not provide the details of how to implement this recovery scheme in a practical system, e.g., how to perform the partial decoding.
+2. The idea of this paper is not very novel and is easy to understand. I think the performance of this scheme depends highly on the data layout.
+## Future Works
+1. A very serious issue is how to decrease the overhead of the partial decoding within a rack.
2. For a specific layout, this scheme may lead to the skewed workload for a rack.
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CORE-TC'15.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CORE-TC'15.md
old mode 100644
new mode 100755
index 85bd931..45ebe95
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CORE-TC'15.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/CORE-TC'15.md
@@ -1,50 +1,50 @@
----
-typora-copy-images-to: paper_figure
-
----
-
-## Enabling Concurrent Failure Recovery for Regenerating-Coding-Based Storage System: From Theory to Practice
-@MSST'13 @TC-2015 @Categroy
-[TOC]
-## 1. Summary
-### Motivation of this paper:
-This paper argues node failures are often **correlated and co-occurring** in practice. Thus, it is very necessary to minimizing the bandwidth for concurrent failure recovery, and it can also provide additional benefits. (e.g., delaying recovery).
-
-This paper wants to investigate how to enable regenerating code to recover concurrent failures, while retains existing optimal regenerating code constructions and the underlying regenerating coded data.
-
-### CORE
-CORE is built on existing MSR code. It preserves the optimal storage efficiency of MSR. In its design, it does not fix the number $t$ of failure nodes.
-
-#### Baseline Approach
-In its baseline approach, the main idea is to divide packets into two categories: *real packets* and *virtual packets*
-> 1. To recover each of the $t$ failed nodes, the relayer still operates as if it connects to $n-1$ nodes. It treats the packets downloaded from the failed nodes as virtual packets.
-> 2. It computes each virtual packet as a **function** of the downloaded real packets.
-> 3. It can compose $t(t-1)$ equations based on the above idea, however, it needs to guarantee the **linearly independency** between each equations. And finally it can recover multiple loss packets by solving the equation system.
-> 
-> 
-
-#### For any Failure Pattern
-
-In the baseline method, it needs the failure pattern to statisfy the constraints of linearly independency which cannot work in some failure patterns. So it also extend the baseline approach to deal with the case that cannot statsify linearly independency.
-> Solution: include other surviving nodes to form a virtual failure pattern until it can statisfy the linearly independency (achieve the good failure pattern).
-> 
-
-### Implementation and Evaluation
-- It implements it with HDFS-RAID. For further speedup, it also applies a pipeline mode to leverages the multiple threads to parallel the encoding/decoding operations.
-- Evaluation
-> 20 Nodes, Compare with RS
-> Decoding throughput, striping throughput, recovery throughput, degarded read throughput, Map-Reduce runtime with node failures.
-
-## 2. Strength (Contributions of the paper)
-- CORE just adds a new recovery scheme atop existing regenerating codes, it guarantees the generality.
-- The implementation of CORE is not very complex, its relayer architecture can support regenerating code easily. It should be easy to follow.
-- This paper also integrates CORE in HDFS to test its practical performance.
-- In this paper, it derives the experiment to investigate the bottleneck during the recovery operation. It observes zthe download step is the bottleneck which can support the need of minimizing recovery bandwidth.
-
-## 3. Weakness (Limitations of the paper)
-- It just considers the cases of Interference Alignment (IA) codes and Product Matrix (PM) codes. How about other kinds of regenerating codes? I think it should clarify this point.
-- This paper does not provide a way to identify the bad failure pattern.
-
-## 4. Future Works
-- Due to the fact that this paper does not provide the rationale behind bad failure pattern to cause the linearly dependency, I think one thing that can be done is to provide a genernal way to identify the bad failure pattern.
+---
+typora-copy-images-to: paper_figure
+
+---
+
+## Enabling Concurrent Failure Recovery for Regenerating-Coding-Based Storage System: From Theory to Practice
+@MSST'13 @TC-2015 @Category
+[TOC]
+## 1. Summary
+### Motivation of this paper:
+This paper argues that node failures are often **correlated and co-occurring** in practice. Thus, it is necessary to minimize the bandwidth of concurrent failure recovery, which can also provide additional benefits (e.g., allowing recovery to be delayed).
+
+This paper investigates how to enable regenerating codes to recover concurrent failures, while retaining existing optimal regenerating code constructions and the underlying regenerating-coded data.
+
+### CORE
+CORE is built on existing MSR codes and preserves their optimal storage efficiency. In its design, the number $t$ of failed nodes is not fixed.
+
+#### Baseline Approach
+In its baseline approach, the main idea is to divide packets into two categories: *real packets* and *virtual packets*
+> 1. To recover each of the $t$ failed nodes, the relayer still operates as if it connects to $n-1$ nodes. It treats the packets that would be downloaded from the failed nodes as virtual packets.
+> 2. It computes each virtual packet as a **function** of the downloaded real packets.
+> 3. It can compose $t(t-1)$ equations based on the above idea; however, it needs to guarantee **linear independence** among the equations. Finally, it can recover the multiple lost packets by solving the equation system (a toy sketch follows this list).
+> 
+> 
+
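+A toy numerical sketch of the baseline idea above (random stand-in coefficients, not CORE's actual equations):
+
+```python
+# The t(t-1) virtual packets x are unknowns tied to the real packets by a
+# linear system A x = b; if A is full rank (linear independence), the relayer
+# solves for x and can then complete the t single-failure repairs.
+import numpy as np
+
+t = 3
+m = t * (t - 1)                        # number of virtual packets
+rng = np.random.default_rng(0)
+A = rng.integers(1, 7, size=(m, m)).astype(float)   # stand-in coefficient matrix
+b = rng.integers(0, 10, size=m).astype(float)       # contributions of the real packets
+
+if np.linalg.matrix_rank(A) == m:      # good failure pattern: equations are independent
+    virtual_packets = np.linalg.solve(A, b)
+    print(virtual_packets)
+else:                                  # bad pattern: CORE enlarges the failure pattern
+    print("rank deficient; include more surviving nodes")
+```
+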
+#### For any Failure Pattern
+
+In the baseline method, the failure pattern needs to satisfy the linear-independence constraints, which does not hold for some failure patterns. So the paper also extends the baseline approach to deal with the cases that cannot satisfy linear independence.
+> Solution: include other surviving nodes to form a virtual failure pattern until linear independence is satisfied (i.e., a good failure pattern is reached).
+> 
+
+### Implementation and Evaluation
+- It is implemented with HDFS-RAID. For further speedup, it also applies a pipelined mode that leverages multiple threads to parallelize the encoding/decoding operations.
+- Evaluation
+> 20 nodes, compared with RS
+> Decoding throughput, striping throughput, recovery throughput, degraded read throughput, and MapReduce runtime with node failures.
+
+## 2. Strength (Contributions of the paper)
+- CORE just adds a new recovery scheme atop existing regenerating codes, which guarantees generality.
+- The implementation of CORE is not very complex; its relayer architecture can support regenerating codes easily, and it should be easy to follow.
+- This paper also integrates CORE in HDFS to test its practical performance.
+- In this paper, it conducts an experiment to investigate the bottleneck during the recovery operation, and observes that the download step is the bottleneck, which supports the need to minimize recovery bandwidth.
+
+## 3. Weakness (Limitations of the paper)
+- It only considers the cases of Interference Alignment (IA) codes and Product Matrix (PM) codes. How about other kinds of regenerating codes? I think it should clarify this point.
+- This paper does not provide a way to identify the bad failure pattern.
+
+## 4. Future Works
+- Since this paper does not provide the rationale for why a bad failure pattern causes linear dependency, one thing that can be done is to provide a general way to identify bad failure patterns.
 - In this paper, it fixes the quantitative relationship between the parameters $(k,n)$ of regenerating codes; one thing that can be done is to investigate how those parameters affect CORE. Can it reduce more redundancy?
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Clay Codes-FAST'18.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Clay Codes-FAST'18.md
old mode 100644
new mode 100755
index 8428e2a..8012796
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Clay Codes-FAST'18.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Clay Codes-FAST'18.md
@@ -1,47 +1,47 @@
----
-typora-root-url: paper_figure
-
----
-
-# Clay Codes: Moulding MDS Codes to Yield an MSR Code
-@FAST'18 @erasure code @Ceph @SMSR Codes
-[TOC]
-
-## Summary
-
-***Motivation of this paper***: There is a strong need for erasure codes that can efficiently recovery from single-code. However, the conventional repair of an RS code is inefficient. This paper will deal with Minimum Storage Regeneration (MSR) codes, which has least possible repair bandwidth and three additional properties. It proposes the Coupled-layer (Clay) Code by extending the theoretical construction of Ye and Brag with practical considerations.
-
-> 1. minimal disk read
-> 2. minimize sub-packetization level $\alpha$
-> 3. small field size, low-complexity implementation
-
-***Clay Code Construction***: Here is the example with a (4, 2) scalar RS Code.
-
-
-
-
-***Decoding***: For decoding, it introduces the notion of an Intersection Score (IS) to indicate the number of the vertices that correspond to erased bytes and which are at the same time colored red. Decoding is carried out sequentially, layer-by-layer, in order of increasing IS.
-
-For example:
-
-
-
-***Implementation and Evaluation***: This work introduce the notion of **sub-chunking** to enable use of **vector erasure codes** with Ceph. For implementations of various MDS codes and Galois-field arithmetic, it uses **Jerasure** and **GF-Complete** libraries.
-In its experiments, it does evaluations in Amazon EC2. Those experiments are carried out on both **fixed** and **variable object-size** workloads. Measurements are made of:
-
-> 1. repair network traffic
-> 2. repair disk read
-> 3. repair time
-> 4. encoding time
-> 5. I/O performance for degraded, normal operations.
-
-## Strength (Contributions of the paper)
-1. introduce the construction of Clay codes
-2. do the modification of Ceph to support any vector code
-3. the integration of Clay codes as a plugin to Ceph
-
-## Weakness (Limitations of the paper)
-
-
-## Future Work
-
+---
+typora-root-url: paper_figure
+
+---
+
+# Clay Codes: Moulding MDS Codes to Yield an MSR Code
+@FAST'18 @erasure code @Ceph @SMSR Codes
+[TOC]
+
+## Summary
+
+***Motivation of this paper***: There is a strong need for erasure codes that can efficiently recover from a single node failure. However, the conventional repair of an RS code is inefficient. This paper deals with Minimum Storage Regenerating (MSR) codes, which have the least possible repair bandwidth together with three additional properties. It proposes the Coupled-layer (Clay) code by extending the theoretical construction of Ye and Barg with practical considerations.
+
+> 1. minimal disk read
+> 2. minimize sub-packetization level $\alpha$
+> 3. small field size, low-complexity implementation
+
+***Clay Code Construction***: Here is an example based on a (4, 2) scalar RS code (a small parameter check is sketched below).
+
+
+
+
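+A small arithmetic check for this $(n, k) = (4, 2)$ example, assuming the usual MSR parameter relations with $d = n - 1$ helpers (an assumption taken from the general MSR literature, not spelled out in this note):
+
+```python
+# Sub-packetization alpha = (d - k + 1)^(n / (d - k + 1)); each helper sends
+# alpha / (d - k + 1) sub-chunks during the repair of one node.
+n, k = 4, 2
+d = n - 1
+q = d - k + 1                      # = n - k
+alpha = q ** (n // q)              # sub-packetization level: 4
+per_helper = alpha // q            # sub-chunks sent by each of the d helpers: 2
+msr_read = d * per_helper          # 6 sub-chunks in total
+rs_read = k * alpha                # conventional RS repair reads 8 sub-chunks
+print(alpha, msr_read, rs_read)
+```
+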
+***Decoding***: For decoding, it introduces the notion of an Intersection Score (IS), which counts the vertices that correspond to erased bytes and are at the same time colored red. Decoding is carried out sequentially, layer by layer, in order of increasing IS.
+
+For example:
+
+
+
+***Implementation and Evaluation***: This work introduces the notion of **sub-chunking** to enable the use of **vector erasure codes** with Ceph. For implementations of various MDS codes and Galois-field arithmetic, it uses the **Jerasure** and **GF-Complete** libraries.
+In its experiments, it performs evaluations on Amazon EC2. The experiments are carried out on both **fixed** and **variable object-size** workloads. Measurements are made of:
+
+> 1. repair network traffic
+> 2. repair disk read
+> 3. repair time
+> 4. encoding time
+> 5. I/O performance for degraded, normal operations.
+
+## Strength (Contributions of the paper)
+1. It introduces the construction of Clay codes.
+2. It modifies Ceph to support any vector code.
+3. It integrates Clay codes as a plugin into Ceph.
+
+## Weakness (Limitations of the paper)
+
+
+## Future Work
+
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Degraded-First-Scheduling-DSN'14.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Degraded-First-Scheduling-DSN'14.md
old mode 100644
new mode 100755
index cb1b693..0519946
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Degraded-First-Scheduling-DSN'14.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Degraded-First-Scheduling-DSN'14.md
@@ -1,36 +1,36 @@
----
-typora-copy-images-to: paper_figure
----
-# Degraded-First Scheduling for MapReduce in Erasure-Coded Storage System
-@DSN'14 @degraded read schedule
-[TOC]
-
-## Summary
-***Motivation of this paper***: There remains an open issue of how to customize the data analytics paradigm (e.g. MapReduce) when they operate in failure mode and need to perform degraded read.
-> Based on the observation: while the local tasks are running, the MapReduce job does not fully utilize the available network resources.
-
-
-***Degraded-first scheduling***
-- **Main idea**: schedule some degraded tasks at the earlier stages of a MapReduce job and allow them to download data first using the unused network resources.
->1. to move the launch of some degraded tasks ahead to take advantage of the unused network resources, so as to relieve the competition for network resources among degraded tasks later.
->2. the default locality-first scheduling launches degraded tasks at the end, thereby making them **compete for network resources**.
-
-- This paper conducts simple mathematical analysis to compare the default locality-first scheduling and its basic degraded-first scheduling in terms of the runtime of a MapReduce job.
-> provide preliminary insights into the potnetial improvement of degraded-first scheduling in failure mode.
-
-***Implementation and Evaluation***:
-**Implementation**:
-This paper implements degraded-first scheduling by modifying the source code of Hadoop 0.22.0.
-**Evaluation**:
-1. Comparison of MapReduce Runtime
-> Single-job scenario vs Multi-job scenario
-
-## Strength (Contributions of the paper)
-1. This paper porposes the degraded-first scheduling, a new MapReduce scheduling scheme that improves MapReduce performance in erasure-coded clustered storage system operating in failure mode.
-2. It also implements a discrete event simulator for MapReduce to explore the performance gain of degraded-first scheduling in a large-scale cluster.
-3. This paper implements degraded-first scheduling on Hadoop and compare the performance of locality-first scheduling and degraded-first scheduling in a 13-node Hadoop cluster.
-
-## Weakness (Limitations of the paper)
-1. I think the idea of this paper is not very novel and intutive.
-2. This experiment of this paper is not very enough.
-
+---
+typora-copy-images-to: paper_figure
+---
+# Degraded-First Scheduling for MapReduce in Erasure-Coded Storage System
+@DSN'14 @degraded read schedule
+[TOC]
+
+## Summary
+***Motivation of this paper***: There remains an open issue of how to customize data analytics paradigms (e.g., MapReduce) when they operate in failure mode and need to perform degraded reads.
+> Based on the observation: while the local tasks are running, the MapReduce job does not fully utilize the available network resources.
+
+
+***Degraded-first scheduling***
+- **Main idea**: schedule some degraded tasks at the earlier stages of a MapReduce job and allow them to download data first using the unused network resources.
+>1. to move the launch of some degraded tasks ahead to take advantage of the unused network resources, so as to relieve the competition for network resources among degraded tasks later.
+>2. the default locality-first scheduling launches degraded tasks at the end, thereby making them **compete for network resources**.
+
+- This paper conducts simple mathematical analysis to compare the default locality-first scheduling and its basic degraded-first scheduling in terms of the runtime of a MapReduce job.
+> This provides preliminary insights into the potential improvement of degraded-first scheduling in failure mode.
+
+***Implementation and Evaluation***:
+**Implementation**:
+This paper implements degraded-first scheduling by modifying the source code of Hadoop 0.22.0.
+**Evaluation**:
+1. Comparison of MapReduce Runtime
+> Single-job scenario vs Multi-job scenario
+
+## Strength (Contributions of the paper)
+1. This paper proposes degraded-first scheduling, a new MapReduce scheduling scheme that improves MapReduce performance in erasure-coded clustered storage systems operating in failure mode.
+2. It also implements a discrete event simulator for MapReduce to explore the performance gain of degraded-first scheduling in a large-scale cluster.
+3. This paper implements degraded-first scheduling on Hadoop and compares the performance of locality-first scheduling and degraded-first scheduling in a 13-node Hadoop cluster.
+
+## Weakness (Limitations of the paper)
+1. I think the idea of this paper is not very novel and is rather intuitive.
+2. The experiments in this paper are not sufficient.
+
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/EC Store-ICDCS'18.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/EC Store-ICDCS'18.md
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ECPipe-ATC'17.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ECPipe-ATC'17.md
old mode 100644
new mode 100755
index fcb692d..870c127
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ECPipe-ATC'17.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ECPipe-ATC'17.md
@@ -1,52 +1,52 @@
----
-typora-copy-images-to: paper_figure
----
-# Repair Pipelining for Erasure-Coded Storage
-@ATC'17 @ECPipe
-[TOC]
-
-## Summary
-***Motivation of this paper***: While many paper tried to reduce the repair time effectively, it still remains higher than the normal read time in general. So this paper wants to solve the question that "Can it further reduce the repair time of erasure code to almost the same as the normal read time?"
-In the previous work called PPR, it solves the problem that the bandwidth usage distribution is highly skewed. However, it still remains not fully balanced.
-
-### ECpipe
-#### 1. Main Idea
-In this paper, it assumes **the network bandwidth is the bottleneck**. And Repair pipelining is designed for speeding up the repair of a single failed block per stripe.
-The main argured point:
-> The amplification of repair traffic implies the congestion ar the **downlink** of the requestor, thereby increasing the overall repair time.
-
-The goal of ECPipe:
-1. eliminating bottlenecked links (no link transmits more traffic than others)
-2. effectively utilizing bandwith resoureces during repair (i.e., links should not be idle for most times)
-3. Repair Pipelining is designed for speeding up the repair of **a single block** failed per stripe.
-
-#### 2. Motivation
-the drawback of conventional repair is the bandwidth usage distribution is highly **skewed**
-> the download link of the requestor is highly congested, while the links among helpers are not fully utilized.
-
-
-
-***Implementation and Evaluation***:
-
-## Strength (Contributions of the paper)
-1. This paper addresses two types of repair operations, including **degraded reads** and **full-node recovery**. And the repair pipelining can achieves $O(1)$ repair time in homogeneous environments.
-2. It also considers the case in heterogeneous environments (i.e., link bandwidths are different)
->a. parallel reads of reconstructed data
->b. find an optimal repair path across storage nodes
-
-3. Implement a repair pipelining prototype, which runs as a middleware layer atop an existing storage system and performs repair operations on behalf of the storage system.
-4. Evaluate repair pipelining on a local cluster and two geo-distributed Amazon EC2 clusters
-## Weakness (Limitations of the paper)
-1. If a stripe has multiple failed blocks, it triggers a multi-failure repair, which resorts to conventional repair. So this paper not addresses the problem of how to solve it by using pipelining.
-## Future Works
-
-
-## Reviewer Feedback
-### Weaknesses
-- The authors argue PPR does not evenly distribute the network bandwidth. But it is easily fixable given the linearity of earsure codes.
-- Chaining storage nodes in a pipeline is also dangerous, any link failure between the helpers or slowdown of any helper can now affect the whole pipleline.
-- Contributions are not necessarily novel.
-- The implementation is weak, as a standalone module in oreder to make implementation easier. A direct implementation inside HDFS and QFS and making them accepted into upstream open source would be a much stronger contribution, because I think this work has practical value.
-- Over-simplified assumptions (e.g., ignoring computation and memory overhead, static and uniform network links) make it more like a research prototype instead of a readily deployble product.
-- Netowrk bandwidth as a bottleneck
-- Realistic considerations
+---
+typora-copy-images-to: paper_figure
+---
+# Repair Pipelining for Erasure-Coded Storage
+@ATC'17 @ECPipe
+[TOC]
+
+## Summary
+***Motivation of this paper***: While many papers have tried to reduce the repair time, it still remains higher than the normal read time in general. So this paper asks: can the repair time of erasure-coded storage be further reduced to almost the same as the normal read time?
+The previous work PPR alleviates the problem that the bandwidth usage distribution is highly skewed; however, the usage still remains not fully balanced.
+
+### ECPipe
+#### 1. Main Idea
+In this paper, it assumes **the network bandwidth is the bottleneck**, and repair pipelining is designed for speeding up the repair of a single failed block per stripe.
+The main argued point:
+> The amplification of repair traffic implies congestion at the **downlink** of the requestor, thereby increasing the overall repair time.
+
+The goal of ECPipe:
+1. eliminating bottlenecked links (no link transmits more traffic than others)
+2. effectively utilizing bandwidth resources during repair (i.e., links should not be idle most of the time)
+3. Repair Pipelining is designed for speeding up the repair of **a single block** failed per stripe.
+
+#### 2. Motivation
+The drawback of conventional repair is that the bandwidth usage distribution is highly **skewed**:
+> the download link of the requestor is highly congested, while the links among helpers are not fully utilized.
+
+
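+A minimal sketch of the pipelining idea (not the paper's implementation; plain integer arithmetic stands in for the Galois-field arithmetic a real RS repair would use):
+
+```python
+# k helpers repair one lost chunk by streaming partial sums of slices along a
+# chain, so every link carries roughly one chunk of traffic instead of the
+# requestor downloading k full chunks over its own downlink.
+import numpy as np
+
+def pipelined_repair(chunks, coeffs, num_slices):
+    """chunks: list of k equal-length arrays held by the helpers."""
+    k = len(chunks)
+    slices = [np.array_split(c, num_slices) for c in chunks]
+    link_bytes = [0] * k              # traffic on link i -> i+1 (last link -> requestor)
+    repaired = []
+    for s in range(num_slices):
+        partial = coeffs[0] * slices[0][s]        # helper 0 starts the chain
+        for i in range(1, k):
+            link_bytes[i - 1] += partial.nbytes   # forward the partial sum to the next helper
+            partial = partial + coeffs[i] * slices[i][s]
+        link_bytes[k - 1] += partial.nbytes       # the last helper sends the result out
+        repaired.append(partial)
+    return np.concatenate(repaired), link_bytes
+
+chunks = [np.arange(8, dtype=np.int64) + i for i in range(4)]   # toy k=4 helpers
+lost, traffic = pipelined_repair(chunks, coeffs=[1, 2, 3, 4], num_slices=4)
+print(traffic)   # every link carries ~one chunk; no single downlink carries k chunks
+```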
+
+***Implementation and Evaluation***:
+
+## Strength (Contributions of the paper)
+1. This paper addresses two types of repair operations, namely **degraded reads** and **full-node recovery**, and repair pipelining achieves $O(1)$ repair time in homogeneous environments.
+2. It also considers the case in heterogeneous environments (i.e., link bandwidths are different)
+>a. parallel reads of reconstructed data
+>b. find an optimal repair path across storage nodes
+
+3. Implement a repair pipelining prototype, which runs as a middleware layer atop an existing storage system and performs repair operations on behalf of the storage system.
+4. Evaluate repair pipelining on a local cluster and two geo-distributed Amazon EC2 clusters
+## Weakness (Limitations of the paper)
+1. If a stripe has multiple failed blocks, it triggers a multi-failure repair, which resorts to conventional repair. So this paper does not address how to handle multi-failure repair with pipelining.
+## Future Works
+
+
+## Reviewer Feedback
+### Weaknesses
+- The authors argue that PPR does not evenly distribute the network bandwidth, but this is easily fixable given the linearity of erasure codes.
+- Chaining storage nodes in a pipeline is also dangerous: any link failure between the helpers or a slowdown of any helper can now affect the whole pipeline.
+- Contributions are not necessarily novel.
+- The implementation is weak, being a standalone module in order to make implementation easier. A direct implementation inside HDFS and QFS, accepted into the upstream open-source projects, would be a much stronger contribution, because I think this work has practical value.
+- Over-simplified assumptions (e.g., ignoring computation and memory overhead, static and uniform network links) make it more like a research prototype instead of a readily deployable product.
+- Network bandwidth as a bottleneck
+- Realistic considerations
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Encoding-aware-DSN'15.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Encoding-aware-DSN'15.md
old mode 100644
new mode 100755
index 4c40c3b..2af2c76
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Encoding-aware-DSN'15.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Encoding-aware-DSN'15.md
@@ -1,46 +1,46 @@
----
-typora-copy-images-to: paper_figure
----
-# Enabling Efficient and Reliable Transition from Replication to Erasure Coding for Clustered File Systems
-@DSN'15 @Encoding-aware replication
-[TOC]
-
-## Summary
-***Motivation of this paper***: When data blocks are first stored with replication, replica placement plays a critical role in determining both performance and availability of the subsequent encoding operation,
-> *random replication (RR)* brings both performance and availability issues to the subsequent encoding operation.
-
-***Encoding-Aware Replication***
-- **Main Idea**: For each group of data blocks to be encoded together, EAR keeps one replica of each data block in the same rack, while storing the remaining replicas in other racks by equivalently solving a maximum matching problem.
-> It assumes that cross-rack data transfer is the performance bottleneck in a CFS architecture.
-
-- RR potentially harms performance and availability.
-> The primary reason is the replica layout of each data block is independently determined, while the data blocks are actually related when they are encoded together.
-> 
-
-**Eliminate Cross-Rack Downloads**
-1. Formation of a stripe: blocks with at least one replica stored in the same rack.\
-> call this rack the **core rack**.
-
-**Guarantee the reliability requirements without relocation**
-1. Model the replica layout to a bipartite graph.
-2. Modeling Reliability problem to a **Max matching** $\rightarrow$ a **Max flow problem** in bipartite graph.
-
-***Implementation and Evaluation***:
-**Implementation**:
-
-**Evaluation**:
-1. Testbed Experiment: 13-nodes Hadoop cluster
->a. Encoding Throughput
->b. Write Reponse Time
->c. Impact on MapReduce Jobs
-
-2. Discrete-Event Simulations
-## Strength (Contributions of the paper)
-1. This paper presents EAR, a new replica placement algorithm that addresses both performance and availability issues encoding operation.
-2. It implements EAR on HDFS implementation, with only a few modification to the source code.
-3. It conducts testbed experiments on a 13-machine cluster and discrete-event simulations based on CSIM 20.
-4. Examine the replica distribution of EAR, and show that it maintains load balancing in storage and read requests as in RR.
-## Weakness (Limitations of the paper)
-1. This paper considers a very simple scenarios, and ignores the heterogeneous workloads and hardware resources.
-## Future Works
+---
+typora-copy-images-to: paper_figure
+---
+# Enabling Efficient and Reliable Transition from Replication to Erasure Coding for Clustered File Systems
+@DSN'15 @Encoding-aware replication
+[TOC]
+
+## Summary
+***Motivation of this paper***: When data blocks are first stored with replication, replica placement plays a critical role in determining both the performance and the availability of the subsequent encoding operation.
+> *random replication (RR)* brings both performance and availability issues to the subsequent encoding operation.
+
+***Encoding-Aware Replication***
+- **Main Idea**: For each group of data blocks to be encoded together, EAR keeps one replica of each data block in the same rack, while storing the remaining replicas in other racks by equivalently solving a maximum matching problem.
+> It assumes that cross-rack data transfer is the performance bottleneck in a CFS architecture.
+
+- RR potentially harms performance and availability.
+> The primary reason is that the replica layout of each data block is determined independently, while the data blocks are actually related when they are encoded together.
+> 
+
+**Eliminate Cross-Rack Downloads**
+1. Formation of a stripe: blocks with at least one replica stored in the same rack.
+> call this rack the **core rack**.
+
+**Guarantee the reliability requirements without relocation**
+1. Model the replica layout as a bipartite graph.
+2. Reduce the reliability problem to a **maximum matching** problem, and further to a **max-flow** problem on the bipartite graph (see the sketch below).
+
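+A toy illustration of the matching reduction (not EAR's exact constraints): check whether each block in a stripe can be assigned to a distinct non-core rack that already holds one of its replicas, via maximum bipartite matching. The layout below is hypothetical.
+
+```python
+import networkx as nx
+from networkx.algorithms import bipartite
+
+replica_racks = {            # block -> racks holding its replicas outside the core rack
+    "b0": {"r1", "r2"},
+    "b1": {"r2", "r3"},
+    "b2": {"r1", "r3"},
+}
+
+G = nx.Graph()
+for blk, racks in replica_racks.items():
+    for rack in racks:
+        G.add_edge(blk, rack)
+
+matching = bipartite.hopcroft_karp_matching(G, top_nodes=set(replica_racks))
+feasible = all(blk in matching for blk in replica_racks)
+print(feasible, {b: matching[b] for b in replica_racks})
+```
+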
+***Implementation and Evaluation***:
+**Implementation**:
+
+**Evaluation**:
+1. Testbed Experiment: 13-node Hadoop cluster
+>a. Encoding Throughput
+>b. Write Response Time
+>c. Impact on MapReduce Jobs
+
+2. Discrete-Event Simulations
+## Strength (Contributions of the paper)
+1. This paper presents EAR, a new replica placement algorithm that addresses both the performance and availability issues of the encoding operation.
+2. It implements EAR on HDFS, with only a few modifications to the source code.
+3. It conducts testbed experiments on a 13-machine cluster and discrete-event simulations based on CSIM 20.
+4. Examine the replica distribution of EAR, and show that it maintains load balancing in storage and read requests as in RR.
+## Weakness (Limitations of the paper)
+1. This paper considers very simple scenarios and ignores heterogeneous workloads and hardware resources.
+## Future Works
1. I think it can further study the scenarios with heterogeneous workload and hardware resources.
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Lazy Means Smart-SYSTOR'14.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Lazy Means Smart-SYSTOR'14.md
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/OpenEC-FAST'19.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/OpenEC-FAST'19.md
old mode 100644
new mode 100755
index 2243942..b810bca
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/OpenEC-FAST'19.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/OpenEC-FAST'19.md
@@ -1,33 +1,33 @@
-# OpenEC: Toward Unified and Configurable Erasure Coding Management in Distributed Storage Systems
-@Erasure code @ongoing work
-[TOC]
-
-## Summary
-***Motivation of this paper***:
-Integrating new erasure coding solutions into existing distributed storage systems is arguably a challenging task that requires non-trivial re-engineering of the underlying storage workflows.
-Limitation of EC management
->1. Limited support for adding advanced erasure codes
->2. Limited configurability for workflows of coding operations
->3. Limited configurability for placement of coding operations
-
-***OpenEC***:
-- Main Idea:
-Its main idea is to decouple erasure coding management from the underlying DSS operations
-First, this paper design a new erasure coding programming model which is not based on low-level data buffers but based on high-level abstractions, called **ECDAG**. ECDAG is a directed graph to describe the erasure coding process. It has freedom to be enforced with different underlined physical meanings. ECDAG provides the opportunity for control flow and data flow design in erasure coding process.
-> Centralized application, distributed application
-
-For the OpenEC design, it foucs on two types design: **Online EC design** and **Offline EC design**. And it also provides three levels of optimimzation to custom ECDAG designs for offline erasure coding.
->1. Level 0: leverage the **Bind** to avoid the redundant traffic in distributed application of ECDAG.
->2. Level 1: In offline erasure coding, OpenEC leverages the **AddConstraint** takes the locality into consideration and choose location from the locations of all the childs.
->3. Level 2: The optimization works in **hierarchical data center**. OpenEC applies pipelining technique. And it can sort the children according to their regions to check whether layering can be applied to reduce the cross region network traffic.
-
-## Strength (Contributions of the paper)
-1. this paper propose a new programming model for EC implementation and deployment **ECDAG**, which defines the flows if erasure coding operations as a directed acyclic graph.
-2. this paper also designs OpenEC, which translates ECDAGs into erasure coding operations atop a DSS. In particular, OpenEC can self-customize ECDAGs for hierarchical topologies to improve repair performance.
-3. it also prototypes OpenEC on HDFS-RAID and Hadoop 3.0 HDFS.
-4. It conducts extensive experiments on OpenEC in a local cluster and Amazon EC2.
-
-## Weakness (Limitations of the paper)
-
-## Future Work
-
+# OpenEC: Toward Unified and Configurable Erasure Coding Management in Distributed Storage Systems
+@Erasure code @ongoing work
+[TOC]
+
+## Summary
+***Motivation of this paper***:
+Integrating new erasure coding solutions into existing distributed storage systems is arguably a challenging task that requires non-trivial re-engineering of the underlying storage workflows.
+Limitations of EC management:
+>1. Limited support for adding advanced erasure codes
+>2. Limited configurability for workflows of coding operations
+>3. Limited configurability for placement of coding operations
+
+***OpenEC***:
+- Main Idea:
+Its main idea is to decouple erasure coding management from the underlying DSS operations.
+First, this paper designs a new erasure coding programming model, **ECDAG**, which is based not on low-level data buffers but on high-level abstractions. An ECDAG is a directed acyclic graph that describes the erasure coding process; the same ECDAG can be enforced with different underlying physical meanings, which gives freedom in designing the control flow and data flow of the erasure coding process.
+> Centralized application, distributed application
+
+For the OpenEC design, it focuses on two types of design: **Online EC design** and **Offline EC design**. It also provides three levels of optimization to customize ECDAG designs for offline erasure coding (a toy ECDAG is sketched after this list):
+>1. Level 0: leverage **Bind** to avoid redundant traffic in the distributed application of an ECDAG.
+>2. Level 1: in offline erasure coding, OpenEC leverages **AddConstraint**, which takes locality into consideration and chooses a location from the locations of all the children.
+>3. Level 2: the optimization works in a **hierarchical data center**. OpenEC applies a pipelining technique, and it can sort the children according to their regions to check whether layering can be applied to reduce cross-region network traffic.
+
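+A toy illustration of the ECDAG idea (hypothetical structure, not OpenEC's actual API): leaves are surviving blocks, and an internal node computes a lost block as a linear combination of its children.
+
+```python
+# Integers stand in for GF(2^w) symbols; a real ECDAG would also carry
+# placement hints such as Bind/AddConstraint decisions.
+ecdag = {
+    "leaves": [0, 1, 2],
+    "nodes": {4: {"children": [0, 1, 2], "coeffs": [1, 1, 1]}},  # repair block 4
+}
+
+def evaluate(ecdag, leaf_data):
+    values = dict(leaf_data)
+    for node, spec in ecdag["nodes"].items():
+        values[node] = sum(c * values[ch] for c, ch in zip(spec["coeffs"], spec["children"]))
+    return values
+
+print(evaluate(ecdag, {0: 3, 1: 5, 2: 6}))   # {0: 3, 1: 5, 2: 6, 4: 14}
+```
+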
+## Strength (Contributions of the paper)
+1. This paper proposes a new programming model for EC implementation and deployment, **ECDAG**, which defines the flows of erasure coding operations as a directed acyclic graph.
+2. This paper also designs OpenEC, which translates ECDAGs into erasure coding operations atop a DSS. In particular, OpenEC can self-customize ECDAGs for hierarchical topologies to improve repair performance.
+3. It also prototypes OpenEC on HDFS-RAID and Hadoop 3.0 HDFS.
+4. It conducts extensive experiments on OpenEC in a local cluster and Amazon EC2.
+
+## Weakness (Limitations of the paper)
+
+## Future Work
+
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ProactiveLatency-SoCC'17.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ProactiveLatency-SoCC'17.md
old mode 100644
new mode 100755
index 02ffff3..cf764c3
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ProactiveLatency-SoCC'17.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/ProactiveLatency-SoCC'17.md
@@ -1,41 +1,41 @@
----
-typora-copy-images-to: paper_figure
----
-# Latency Reduction and Load Balancing in Coded Storage Systems
-@SoCC'17 @Proactive Latency Reduction
-[TOC]
-
-## Summary
-***Motivation of this paper***: the rigid load balancing schemes, i.e., passive recovery after timeout, is the major cause for long latency tails in erasure coded storage, especially in the presence of skewed demands.
-
-***Load Balance in Coded Storage***:
-This paper intends to let system intentionally and intelligently perform degraded reads based on **demand informatoin** and **load statistics** in the system to direct requests away from hot servers. It would **proactively** launch a degraded read ($S_2$, $S_3$ and $L_1$) to reconstruct the requested object in the first place, and both the request latency and the load of server $S_1$ can be reduced.
-
-
-- A key question: whether to serve a request with a normal read or a degraded read and which servers to serve the degraded read.
-- 1. **Statisitical Optimization**: In the model of this paper, the **load balancing metric $F()$** is defined as the function of the **expected loads $L$** on all storage nodes plus the **exisiting queues pending $Q$** on them.
-$$
-minimize: \quad F()=\vec {L}+ \vec {Q}
-$$
-
-- 2. **Per-Request Optimal Decisions**: A more direct approach is to instantaneously probe the queue status of related data nodes and to make an optimal decision for each request. Then, this paper needs a criterion to measure how good a load direction choice is.
-> a. **Least Latency First**: it only optimizes the latency of the current request in question regardless of future requests. It first probes queue status and optimizes for each request instantaneously.
-
-> b. **Least Marginak Load First**: it strikes a balance between reducing the current request latency and minimizing the overall system load. It first not only optimizes for each request with instantaneous probing, but also saves system resources for future requests by penalizing degraded reads.
-
-***Implementation and Evaluation***:
-
-
-Each frontend server has a **controller** to execute load direction policies
-Computation of *load direction table*: MOSEK in python.
-**Evaluation**:
->1. (6, 2, 2) LRC: request latencies, task latencies, task waiting times, controller processing times
->2. (6, 3) RS: request latencies, task latencies, task waiting times, controller processing times
-
-## Strength (Contributions of the paper)
-1. This paper propses to proactively and intelligently launch degraded reads in order to shift loads away from hotspots amd prevent potential congestion early.
-## Weakness (Limitations of the paper)
-1. The result of statistical optimization is still **sub-optimal**, because the load direction table is only updated periodically, and failing to utilize the instantaneous load information.
-2. The probing overhead could be a bottleneck of its performance.
-## Future Works
-1. I thinks a serious issue in this paper is the overhead of probing in each storage node. Although this paper tries to leverage the sample to reduce the probing overhead, it can be further investigated other method to minish this kind of overhead.
+---
+typora-copy-images-to: paper_figure
+---
+# Latency Reduction and Load Balancing in Coded Storage Systems
+@SoCC'17 @Proactive Latency Reduction
+[TOC]
+
+## Summary
+***Motivation of this paper***: the rigid load balancing schemes, i.e., passive recovery after timeout, is the major cause for long latency tails in erasure coded storage, especially in the presence of skewed demands.
+
+***Load Balance in Coded Storage***:
+This paper intends to let the system intentionally and intelligently perform degraded reads based on **demand information** and **load statistics** in order to direct requests away from hot servers. It **proactively** launches a degraded read (e.g., from $S_2$, $S_3$ and $L_1$) to reconstruct the requested object in the first place, so that both the request latency and the load of server $S_1$ are reduced.
+
+
+- A key question: whether to serve a request with a normal read or a degraded read and which servers to serve the degraded read.
+- 1. **Statistical Optimization**: in the model of this paper, the **load balancing metric $F(\cdot)$** is defined as a function of the **expected loads $\vec{L}$** on all storage nodes plus the **existing pending queues $\vec{Q}$** on them.
+$$
+\text{minimize} \quad F = \vec{L} + \vec{Q}
+$$
+
+- 2. **Per-Request Optimal Decisions**: a more direct approach is to instantaneously probe the queue status of the related data nodes and make an optimal decision for each request. The paper then needs a criterion to measure how good a load-direction choice is (see the sketch after the two policies below).
+> a. **Least Latency First**: it only optimizes the latency of the current request in question, regardless of future requests. It probes queue status and optimizes for each request instantaneously.
+
+> b. **Least Marginal Load First**: it strikes a balance between reducing the current request latency and minimizing the overall system load. It not only optimizes for each request with instantaneous probing, but also saves system resources for future requests by penalizing degraded reads.
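+
+A rough sketch of the two per-request policies (the queue-length cost model and the penalty term are my assumptions, not the paper's exact formulation):
+
+```python
+# Choose between a normal read (from the primary server) and a degraded read
+# (from a set of helper servers) by probing current queue lengths.
+def least_latency_first(primary, helpers, queue):
+    """LLF: pick whichever choice finishes the current request soonest."""
+    normal_cost = queue[primary] + 1                     # one read task on the primary
+    degraded_cost = max(queue[s] for s in helpers) + 1   # bounded by the slowest helper
+    return "normal" if normal_cost <= degraded_cost else "degraded"
+
+def least_marginal_load_first(primary, helpers, queue, penalty=1.0):
+    """LMLF: like LLF, but penalize degraded reads for the extra load they add."""
+    normal_cost = queue[primary] + 1
+    degraded_cost = max(queue[s] for s in helpers) + 1 + penalty * len(helpers)
+    return "normal" if normal_cost <= degraded_cost else "degraded"
+
+# Example: hot server S1 has a long queue; helpers S2/S3/L1 are lightly loaded.
+queue = {"S1": 8, "S2": 1, "S3": 2, "L1": 0}
+print(least_latency_first("S1", ["S2", "S3", "L1"], queue))        # -> degraded
+print(least_marginal_load_first("S1", ["S2", "S3", "L1"], queue))  # -> degraded
+```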
+
+***Implementation and Evaluation***:
+
+
+Each frontend server has a **controller** to execute the load direction policies.
+Computation of the *load direction table*: MOSEK in Python.
+**Evaluation**:
+>1. (6, 2, 2) LRC: request latencies, task latencies, task waiting times, controller processing times
+>2. (6, 3) RS: request latencies, task latencies, task waiting times, controller processing times
+
+## Strength (Contributions of the paper)
+1. This paper proposes to proactively and intelligently launch degraded reads in order to shift loads away from hotspots and prevent potential congestion early.
+## Weakness (Limitations of the paper)
+1. The result of statistical optimization is still **sub-optimal**, because the load direction table is only updated periodically and fails to utilize instantaneous load information.
+2. The probing overhead could be a bottleneck of its performance.
+## Future Works
+1. I think a serious issue in this paper is the overhead of probing each storage node. Although the paper leverages sampling to reduce the probing overhead, other methods to further reduce this overhead could be investigated.
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RAFI-ATC'18.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RAFI-ATC'18.md
old mode 100644
new mode 100755
index 6745dbb..a2cf55d
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RAFI-ATC'18.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RAFI-ATC'18.md
@@ -1,51 +1,51 @@
----
-typora-copy-images-to: paper_figure
----
-# RAFI: Risk-Aware Failure Identification to Improve the RAS in Erasure-coded Data Centers
-
-@ATC'18 @Failure Identification
-[TOC]
-
-## Summary
-***Motivation of this paper***: The recovery phase of data repair is widely studies and well optimized, In traditional failure identification scheme, all chunks are share the same identification time threshlod, thus losing the opportunities to further improve the RAS. This paper regards data repair as two phases: **recovery phase** and **identification phase**. And it focuses on the **identification phase**.
-> The RAS cannot be improved simultaneously by adjusting the failure identification time threshold.
-
-***RAFI***
-
-- **Key principle of RAFI**: This is based on the dedicated observation on stripes, and it can be classified into two types:
-> 1. a stripe has many failed chunks (high risk stripe) $\rightarrow$ a shorter identification time threshold
-> 2. a stripe has a few failed chunks (low risk stripe) $\rightarrow$ a longer identification time threshold
-
-For the stripes having many failed chunks, it tunes down the failure identification time threshold of those failed chunks (improve the data availability and reliability $\rightarrow$ increasing the repair network traffic)
-
-For the stripes having a few failed chunks, it tunes up the failure identification time threshold of those failed chunks (reducing the repair network traffic $\rightarrow$ reducing data reliability and availability)
-
-
-
-- The time identification threshold is determined by the **total failed chunks** in the stripes (risk level).
-
-- **Compatibility**: Due to the fact that RAFI focuses on the failure indentification phase, it can work together with existing optimizations which focus on the failure recovery phase.
-
-***Implementation and Evaluation***:
-**Implementation**:
-It proposes a hybrid methodology to comprehensively evaluate RAFI via both **simulation** and **prototype implementation**.
->1. the design details and computational cost of RAFI $\rightarrow$ real distributed storage system
->2. the effectiveness and efficiency of RAFI on the RAS $\rightarrow$ Monte-Carlo simulation.
-
-
-
-**Simulator**:
-This paper also developed a simulator, which is written in R (open-source).
-
-This paper does 1) Functions of Erasure Coding Schemes 2) Functions of Recovery Network Bandwidth 3) Comparisons with *Lazy* all in hte simulator.
-## Strength (Contributions of the paper)
-1. It proposes a risk-aware failure identification scheme RAFI to simultaneously improve the Reliability and Availability and Serviceability (RAS).
-2. It also designs a simulator to verify the RAFI.
-3. It implements the prototype of RAFI in HDFS to verify the correctness and computational cost of it.
-## Weakness (Limitations of the paper)
-1. It postpones the failure identification of chunks in low risk stripes, more failed chunks might be generated, thus increasing degraded reads.
-2. RAFI needs to collect the trace firstly and then do the calculation to get the time threshold, which can be out-of-date.
-3. This paper does not mention how it metric the **reliability, availability, serviceability**.
-
-## Future Works
+---
+typora-copy-images-to: paper_figure
+---
+# RAFI: Risk-Aware Failure Identification to Improve the RAS in Erasure-coded Data Centers
+
+@ATC'18 @Failure Identification
+[TOC]
+
+## Summary
+***Motivation of this paper***: The recovery phase of data repair is widely studied and well optimized. In traditional failure identification schemes, all chunks share the same identification time threshold, which forgoes opportunities to further improve the RAS. This paper regards data repair as two phases, the **recovery phase** and the **identification phase**, and focuses on the **identification phase**.
+> The RAS cannot be simultaneously improved by simply adjusting a single failure identification time threshold.
+
+***RAFI***
+
+- **Key principle of RAFI**: it is based on a careful observation of stripes, which can be classified into two types:
+> 1. a stripe has many failed chunks (high risk stripe) $\rightarrow$ a shorter identification time threshold
+> 2. a stripe has a few failed chunks (low risk stripe) $\rightarrow$ a longer identification time threshold
+
+For stripes with many failed chunks, RAFI tunes down the failure identification time threshold of those failed chunks (improving data availability and reliability $\rightarrow$ increasing the repair network traffic).
+
+For stripes with only a few failed chunks, RAFI tunes up the failure identification time threshold of those failed chunks (reducing the repair network traffic $\rightarrow$ slightly reducing data reliability and availability). A minimal sketch of this risk-based threshold selection follows.
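+
+A hedged sketch of the idea (the concrete threshold values and risk cutoffs are my assumptions, not the paper's):
+
+```python
+# Map a stripe's risk level (its current number of failed chunks) to a failure
+# identification time threshold: high-risk stripes get a short threshold so
+# repair starts sooner; low-risk stripes wait longer to avoid needless repairs.
+def identification_threshold(failed_chunks, tolerable_failures=4,
+                             short_min=1, default_min=15, long_min=60):
+    """Return the identification time threshold (minutes) for a stripe."""
+    if failed_chunks >= tolerable_failures - 1:   # one more failure risks data loss
+        return short_min                          # identify (and repair) aggressively
+    if failed_chunks <= 1:                        # plenty of redundancy left
+        return long_min                           # wait: the node may come back
+    return default_min                            # middle ground
+
+for failed in range(4):
+    print(failed, "failed chunk(s) ->", identification_threshold(failed), "min")
+```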
+
+
+
+- The identification time threshold is determined by the **total number of failed chunks** in a stripe (its risk level).
+
+- **Compatibility**: because RAFI focuses on the failure identification phase, it can work together with existing optimizations that focus on the failure recovery phase.
+
+***Implementation and Evaluation***:
+**Implementation**:
+It proposes a hybrid methodology to comprehensively evaluate RAFI via both **simulation** and **prototype implementation**.
+>1. the design details and computational cost of RAFI $\rightarrow$ real distributed storage system
+>2. the effectiveness and efficiency of RAFI on the RAS $\rightarrow$ Monte-Carlo simulation.
+
+
+
+**Simulator**:
+This paper also developed a simulator, which is written in R (open-source).
+
+The paper evaluates 1) functions of erasure coding schemes, 2) functions of recovery network bandwidth, and 3) comparisons with *Lazy*, all in the simulator.
+## Strength (Contributions of the paper)
+1. It proposes a risk-aware failure identification scheme, RAFI, to simultaneously improve reliability, availability, and serviceability (RAS).
+2. It also designs a simulator to verify RAFI.
+3. It implements a prototype of RAFI in HDFS to verify its correctness and computational cost.
+## Weakness (Limitations of the paper)
+1. Because it postpones the failure identification of chunks in low-risk stripes, more failed chunks might accumulate, thus increasing degraded reads.
+2. RAFI needs to collect traces first and then compute the time thresholds, so the thresholds can become out-of-date.
+3. This paper does not mention how it measures **reliability, availability, and serviceability**.
+
+## Future Works
 1. I think this paper could further investigate the tradeoff between the failure identification time and the number of degraded reads that occur, corresponding to the first weakness.
\ No newline at end of file
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RotatedReed-SolomonCodes-FAST'12.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RotatedReed-SolomonCodes-FAST'12.md
old mode 100644
new mode 100755
index e2fe3be..d4d5ce2
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RotatedReed-SolomonCodes-FAST'12.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/RotatedReed-SolomonCodes-FAST'12.md
@@ -1,44 +1,44 @@
----
-typora-copy-images-to: paper_figure
----
-# Rethinking Erasure Codes for Cloud File Systems: Minimizing I/O for Recovery and Degraded Reads
-@FAST'12 @ Minimizing Recovery I/O
-[TOC]
-
-## Summary
-***Motivation of this paper***: Existing erasure codes are not designed with recovery I/O optimization in mind. So it needs
->1. optimize existing codes for these operations
->2. new codes which are intrinsically designed for these operations
-
-***Algorithm for minimizing number of symbols***:
-The algorithm takes as input a **Generator matrix** whose symbols are single bits and the identity of a failed disk and outputs equations to decode each failed symbol.
-This algorithm firstly finds a decoding equation for each failed bit while minimizing the number of total symbols accessed. Given a **code generator matrix** and a list of failed symbols, the algorithm outputs **decoding equations** to recover each failed symbol.
-> 1. enumerate all valid decoding equations for each failed symbol
-> 2. **directed graph** formulation of problem makes it convenient to solve.*(convert this NP-Hard problem to finding the shortest path through the graph)*
->
-
-For the expensive computational overhead, solutions to all common failure common failure combinations may be computed offline **a prior** and stored for future use.
-
-***Rotated Reed-Solomon Codes***: Rotated Reed-Solomon Codes have the modification to standard Reed-Solomon codes. These codes have been designed to optimize the performance of degraded reads for single disk failures. (**improve the penalty of degraded reads**)
-
-
-
-***Analysis and Evaluation***:
-1. Reading from the $P$ drive or using standard Reed-Solomon codes is not a good idea in cloud storage system.
-2. Generally, optimally-sparse and minimum-density codes perform best for disk reconstruction.
-**Evaluation**:
-1. Data Recovery Rate v.s. Symbol Size
-2. Data Recovery Rate v.s. Different erasure code
-
-
-## Strength (Contributions of the paper)
-1. Algorithm minimizes the amount of data needed for recovery and it is applicable to **any XOR** based erasure code.
-2. Its implementation and evaluation of this algorithm demonstrates that minimizing recovery data translates directly into improved I/O performance for cloud file systems.
-3. It develops a new class of codes, called Rotated Reed-Solomon codes. A new class of Reed-Solomon Codes which optimize degraded read performance. Better choice than standard Reed-Solomon codes for the cloud. (**arbitrary numbers of disks and failures**)
-
-## Weakness (Limitations of the paper)
-1. This paper mentions that its algorithm is computationally expensive (from seconds to hours of compute-time)
-
-## Future Work
-1. This paper can be extended by considering how to decrease the computation overhead instead of computing offline **a prior** and storing it for future use.
-
+---
+typora-copy-images-to: paper_figure
+---
+# Rethinking Erasure Codes for Cloud File Systems: Minimizing I/O for Recovery and Degraded Reads
+@FAST'12 @ Minimizing Recovery I/O
+[TOC]
+
+## Summary
+***Motivation of this paper***: Existing erasure codes are not designed with recovery I/O optimization in mind, so the paper pursues two directions:
+>1. optimize existing codes for these operations
+>2. new codes which are intrinsically designed for these operations
+
+***Algorithm for minimizing number of symbols***:
+The algorithm takes as input a **generator matrix** whose symbols are single bits and the identity of a failed disk, and it outputs decoding equations to recover each failed symbol.
+It finds a decoding equation for each failed bit while minimizing the total number of symbols accessed. Given a **code generator matrix** and a list of failed symbols, the algorithm outputs the **decoding equations** that recover each failed symbol.
+> 1. enumerate all valid decoding equations for each failed symbol
+> 2. a **directed graph** formulation makes the problem convenient to solve *(converting this NP-hard problem into finding the shortest path through the graph)*
+>
+
+To cope with the expensive computational overhead, solutions to all common failure combinations may be computed offline **a priori** and stored for future use. A brute-force toy sketch of the symbol-minimization idea follows.
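+
+The sketch below is a hedged, brute-force illustration of the goal (not the paper's graph-based algorithm, and the equations are made-up toy data): for each failed symbol there are several candidate decoding equations, and we pick one per failed symbol so that the union of symbols read is smallest.
+
+```python
+from itertools import product
+
+# Failed symbols d0 and d1; each maps to candidate equations, where an equation
+# is the set of surviving symbols it must read (toy data, not a real code).
+equations = {
+    "d0": [{"d2", "d3", "p0"}, {"d3", "p1"}],
+    "d1": [{"d2", "p1"}, {"d2", "d3", "p2"}],
+}
+
+def min_symbols(equations):
+    """Pick one equation per failed symbol minimizing the total symbols read."""
+    best = None
+    for choice in product(*equations.values()):
+        read = set().union(*choice)            # symbols this combination reads
+        if best is None or len(read) < len(best):
+            best = read
+    return best
+
+print(sorted(min_symbols(equations)))  # -> ['d2', 'd3', 'p1']: overlapping equations win
+```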
+
+***Rotated Reed-Solomon Codes***: Rotated Reed-Solomon codes are a modification of standard Reed-Solomon codes, designed to optimize the performance of degraded reads for single-disk failures (**reducing the penalty of degraded reads**).
+
+
+
+***Analysis and Evaluation***:
+1. Reading from the $P$ drive or using standard Reed-Solomon codes is not a good idea in cloud storage systems.
+2. Generally, optimally-sparse and minimum-density codes perform best for disk reconstruction.
+**Evaluation**:
+1. Data Recovery Rate v.s. Symbol Size
+2. Data Recovery Rate v.s. Different erasure code
+
+
+## Strength (Contributions of the paper)
+1. Algorithm minimizes the amount of data needed for recovery and it is applicable to **any XOR** based erasure code.
+2. Its implementation and evaluation of this algorithm demonstrates that minimizing recovery data translates directly into improved I/O performance for cloud file systems.
+3. It develops a new class of codes, called Rotated Reed-Solomon codes, which optimize degraded read performance and are a better choice than standard Reed-Solomon codes for the cloud (**supporting arbitrary numbers of disks and failures**).
+
+## Weakness (Limitations of the paper)
+1. This paper mentions that its algorithm is computationally expensive (from seconds to hours of compute time).
+
+## Future Work
+1. This paper could be extended by considering how to decrease the computation overhead, instead of computing solutions offline **a priori** and storing them for future use.
+
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/SpeedupXOR-MSST'12 .md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/SpeedupXOR-MSST'12 .md
old mode 100644
new mode 100755
index 1003391..d21a3a8
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/SpeedupXOR-MSST'12 .md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/SpeedupXOR-MSST'12 .md
@@ -1,45 +1,45 @@
----
-typora-copy-images-to: paper_figure
----
-# On the Speedup of Single-Disk Failure Recovery in XOR-Coded Storage Systems: Theory and Practice
-@MSST'12 @speed up the recovery
-[TOC]
-
-## Summary
-***Motivation of this paper***: While enumeration recovery can find the optimal recovery solutio n for a single-disk failure in any XOR-based erasure codes, it has a very high computational overhead. Thus,*enumeration recovery* is **infeasible** to deploy (Limited hardware resources, Remote recovery scenario, Online recovery scenario). This paper seeks to minimize the number of symbols being read (or I/Os) from the surviving disks for the single-disk failure recovery and its objective is **the single-disk failure recovery**.
-
-***Replace Recovery Algorithm***:
-The bottleneck of enumeration recovery is the huge search space of recovery equations. Thus, its goal is to achieve following objectives
->1. Search efficiency
->2. Effective recovery performance
->3. Adaptable to heterogenous disk capabilities
-
-In its simple recovery model, there exists a recovery solution that contains exactly $w$ parity symbols for regenerating $w$ lost data symbols for each stripe in a single-disk failure. In summary, its replace recovery algorithm is:
->replace "some" parity symbols in a collection of $w$ parity symbols with the **same number** of parity symbols in the set of parity symbols in the $i-th$ parity disk (r epeat by resetting it with other parity symbols)
-
-It also needs **a primitive function** that determines if the collection set is **valid** to resolve the $w$ data symbols after being replaced with other parity symbols in the collection of $w$ parity that are considered be included in the collection. And the complexity of total search is $O(mw^3)$, which is **polynomial-time**.
-
-Besides this basic approach, this paper also mentions how to **enhance the likelihood** for its replace search to achieve the optimal point.
-
-***Evaluation and Implementation***: It implements a **parallel recovery architecture** that parallelizes the recovery operation via multi-threaded and multi-server designs.
-
-Evaluation:
->1. Impact of chunk size
->2. Recovery time performance
->3. Parallel recovery
-
-## Strength (Contributions of the paper)
-1. It proposes a **replace recovery algorithm**, which uses a hill-climbing (greedy) approach to optimize the recovery solution.
-2. From a theoretical perspective, this paper proposes a replace recovery algorithm that provides near-optimal recovery performance for STAR and CRS codes, while the algorithm has a polynomial computational complexity
-3. From a practical perspective, it designs and implements its replace recovery algorithm on top of a parallel recovery architecture for scalable recovery performance.
-
-
-## Weakness (Limitations of the paper)
-1. This paper just consider the case that minimizing the number of symbols read. How about other optimization objectives.
-2. This paper just consider the case of single disk failure, how about an arbitrary number of disk failures
-
-## Future Work
-1. This paper can evaluate the performance of its replace recovery approach for other optimization objectives, and explore its applicability for general XOR-based erasure codes.
-2. How to extend this method to the case of an arbitrary number of disk failures is also a direction
-
-
+---
+typora-copy-images-to: paper_figure
+---
+# On the Speedup of Single-Disk Failure Recovery in XOR-Coded Storage Systems: Theory and Practice
+@MSST'12 @speed up the recovery
+[TOC]
+
+## Summary
+***Motivation of this paper***: While enumeration recovery can find the optimal recovery solution for a single-disk failure in any XOR-based erasure code, it has a very high computational overhead. Thus, *enumeration recovery* is **infeasible** to deploy (limited hardware resources, remote recovery scenarios, online recovery scenarios). This paper seeks to minimize the number of symbols read (or I/Os) from the surviving disks, and its objective is **single-disk failure recovery**.
+
+***Replace Recovery Algorithm***:
+The bottleneck of enumeration recovery is the huge search space of recovery equations. Thus, the goal is to achieve the following objectives:
+>1. Search efficiency
+>2. Effective recovery performance
+>3. Adaptable to heterogeneous disk capabilities
+
+In its simple recovery model, there exists a recovery solution that contains exactly $w$ parity symbols for regenerating the $w$ lost data symbols of each stripe in a single-disk failure. In summary, the replace recovery algorithm is:
+>replace "some" parity symbols in a collection of $w$ parity symbols with the **same number** of parity symbols from the $i$-th parity disk (repeat by resetting with other parity symbols)
+
+It also needs **a primitive function** that determines whether the collection is **valid**, i.e., whether it can still resolve the $w$ lost data symbols after the replacement. The complexity of the total search is $O(mw^3)$, which is **polynomial-time**.
+
+Besides this basic approach, this paper also mentions how to **enhance the likelihood** of its replace search reaching the optimal point. A hill-climbing sketch of the replace idea follows.
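+
+A hedged hill-climbing sketch of the replace idea (the `is_valid` check and the per-parity symbol sets are placeholders for the paper's primitive function and code structure, and the data below is made up):
+
+```python
+def cost(collection, symbols_read):
+    """Total number of surviving symbols that the chosen parity collection reads."""
+    return len(set().union(*(symbols_read[p] for p in collection)))
+
+def replace_recovery(initial, candidates, symbols_read, is_valid):
+    """Greedily replace parities while the result stays valid and reads fewer symbols."""
+    best, improved = list(initial), True
+    while improved:
+        improved = False
+        for i in range(len(best)):
+            for new in candidates:
+                if new in best:
+                    continue
+                trial = best[:i] + [new] + best[i + 1:]
+                if is_valid(trial) and cost(trial, symbols_read) < cost(best, symbols_read):
+                    best, improved = trial, True
+    return best
+
+# Toy usage: parities p0..p3 and the symbols each one needs to read.
+symbols_read = {"p0": {"a", "b", "c"}, "p1": {"b", "c"}, "p2": {"a", "d"}, "p3": {"c"}}
+print(replace_recovery(["p0", "p2"], ["p1", "p3"], symbols_read, is_valid=lambda c: True))
+# -> ['p3', 'p1'], reading only {'b', 'c'} instead of the initial {'a', 'b', 'c', 'd'}
+```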
+
+***Evaluation and Implementation***: It implements a **parallel recovery architecture** that parallelizes the recovery operation via multi-threaded and multi-server designs.
+
+Evaluation:
+>1. Impact of chunk size
+>2. Recovery time performance
+>3. Parallel recovery
+
+## Strength (Contributions of the paper)
+1. It proposes a **replace recovery algorithm**, which uses a hill-climbing (greedy) approach to optimize the recovery solution.
+2. From a theoretical perspective, this paper proposes a replace recovery algorithm that provides near-optimal recovery performance for STAR and CRS codes, while the algorithm has a polynomial computational complexity
+3. From a practical perspective, it designs and implements its replace recovery algorithm on top of a parallel recovery architecture for scalable recovery performance.
+
+
+## Weakness (Limitations of the paper)
+1. This paper only considers minimizing the number of symbols read; what about other optimization objectives?
+2. This paper only considers the case of a single disk failure; what about an arbitrary number of disk failures?
+
+## Future Work
+1. This paper can evaluate the performance of its replace recovery approach for other optimization objectives, and explore its applicability for general XOR-based erasure codes.
+2. How to extend this method to the case of an arbitrary number of disk failures is also a direction
+
+
diff --git a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Two_Tale_in_HDFS-FAST'15.md b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Two_Tale_in_HDFS-FAST'15.md
old mode 100644
new mode 100755
index 29d7652..1213bf6
--- a/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Two_Tale_in_HDFS-FAST'15.md
+++ b/StoragePaperNote/ErasureCoding/Techniques for Erasure Coding/Two_Tale_in_HDFS-FAST'15.md
@@ -1,50 +1,50 @@
-# A Tale of Two Erasure Codes in HDFS
-
-@FAST'15 @Techniques for Erasure Codes @Adaptive Coding
-
-[TOC]
-
-## Summary
-
-***Motivation of this paper***: The increase in the amount of data to be read and transferred during recovery for an erasure-coded system results in two major problems: **high degraded read** and **longer reconstruction time**. This paper wants to exploit the **data access characteristics** of Hadoop workloads to achieve better recovery cost and storage efficiency than the existing HDFS architecture.
-
-***Hadoop Adaptively-Coded Distributed File System (HACFS)***:
-HACFS is the first sustem that uses a combination of two codes to dynamically adapt with workload changes and provide both low recovery cost and storage overhead.
-
-- Adaptive coding:
- It designs an adaptive coding module in HDFS and uses a **state machine** to describe the state transitions in HDFS. To maintain this state machine, HAFCS also accounts for the read accesses to data blocks in a file.
-
-> Read hot files with a high read count are encoded with a *fast code*.
-> Read cold files with a low read count are encoded with a *compact code*.
-> 
-
-***Implementation and Evaluation***:
-
-1. Implementation
- This paper has implemented HACFS as an extension to the HDFS-RAID module in the Hadoop Distributed File System. Its key module is the adaptive coding module, which tracks the system states and invokes the different coding interfaces. It includes about 3000 LOCs.
- 
-2. Evaluation
- Compare with Colossus FS, FB HDFS, Azure
-
-- Degraded read latency
-- Reconstruction time
-- Storage overhead
-
-## Strength (Contributions of the paper)
-
-1. This paper designs the Hadoop Adaptively-Coded Distributed File System (HACFS) that adapts to workload changes by using two different erasure codes.
-
-> **fast code**: optimize recovery cost of degraded reads and reconstruction
-> **compact code**: provide low and bounded storage overhead
-
-1. A novel conversion mechanism in HACFS to efficiently up/down-code data blocks between the two codes.
-2. Implement HACFS as an extension to HDFS and demonstrate its efficacy using two cases studies with Product and LRC family codes.
-
-## Weakness (Limitations of the paper)
-
-1. This paper mentions that the conversion cost is high when the HACFS system aggressively converts blocks to limit the storage overhead by upcoding hot files into compact code. So this can be a issue
-
-## Future Work
-
-1. The paper can be extended by solving the problem of how to reduce the conversion cost.
-
+# A Tale of Two Erasure Codes in HDFS
+
+@FAST'15 @Techniques for Erasure Codes @Adaptive Coding
+
+[TOC]
+
+## Summary
+
+***Motivation of this paper***: The increase in the amount of data to be read and transferred during recovery in an erasure-coded system results in two major problems: **high degraded read latency** and **longer reconstruction time**. This paper exploits the **data access characteristics** of Hadoop workloads to achieve better recovery cost and storage efficiency than the existing HDFS architecture.
+
+***Hadoop Adaptively-Coded Distributed File System (HACFS)***:
+HACFS is the first system that uses a combination of two codes to dynamically adapt to workload changes and provide both low recovery cost and low storage overhead.
+
+- Adaptive coding:
+  It designs an adaptive coding module in HDFS and uses a **state machine** to describe the state transitions. To maintain this state machine, HACFS also counts the read accesses to the data blocks of a file (a minimal sketch of the hot/cold decision follows the quote below).
+
+> Read hot files with a high read count are encoded with a *fast code*.
+> Read cold files with a low read count are encoded with a *compact code*.
+> 
+
+***Implementation and Evaluation***:
+
+1. Implementation
+ This paper has implemented HACFS as an extension to the HDFS-RAID module in the Hadoop Distributed File System. Its key module is the adaptive coding module, which tracks the system states and invokes the different coding interfaces. It includes about 3000 LOCs.
+ 
+2. Evaluation
+   Compared with Colossus FS, FB HDFS, and Azure:
+
+- Degraded read latency
+- Reconstruction time
+- Storage overhead
+
+## Strength (Contributions of the paper)
+
+1. This paper designs the Hadoop Adaptively-Coded Distributed File System (HACFS) that adapts to workload changes by using two different erasure codes.
+
+> **fast code**: optimize recovery cost of degraded reads and reconstruction
+> **compact code**: provide low and bounded storage overhead
+
+2. A novel conversion mechanism in HACFS efficiently up/down-codes data blocks between the two codes.
+3. It implements HACFS as an extension to HDFS and demonstrates its efficacy using two case studies with Product and LRC family codes.
+
+## Weakness (Limitations of the paper)
+
+1. This paper mentions that the conversion cost is high when HACFS aggressively converts blocks to limit the storage overhead by upcoding hot files into the compact code, so this can be an issue.
+
+## Future Work
+
+1. The paper can be extended by solving the problem of how to reduce the conversion cost.
+
diff --git a/StoragePaperNote/LongTermAnalysis-MSST'16.md b/StoragePaperNote/LongTermAnalysis-MSST'16.md
old mode 100644
new mode 100755
index b512582..06f6f6e
--- a/StoragePaperNote/LongTermAnalysis-MSST'16.md
+++ b/StoragePaperNote/LongTermAnalysis-MSST'16.md
@@ -1,34 +1,34 @@
----
-typora-copy-images-to: ../paper_figure
----
-A Long-Term User-Centric Analysis of Deduplication Patterns
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| MSST'16 | Deduplication Workload Analysis |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- There are few long-term studies of large backup data sets:
-> most prior studies draw conclusion based on the entire data set
-
-This work extends current studies by using a long-term workload that extends from March 2012 to November 2014.
-> based on a long history, offer new valuable insights
-
-- The characteristics of backup workloads
-Backup workloads tend to have a high churn rate, a lot of stream locality, a high demand for writing, and high redundancy.
-
-
-
-### Long-term user-centric analysis
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-
+---
+typora-copy-images-to: ../paper_figure
+---
+A Long-Term User-Centric Analysis of Deduplication Patterns
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| MSST'16 | Deduplication Workload Analysis |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- There are few long-term studies of large backup data sets:
+> most prior studies draw conclusions based on the entire data set
+
+This work extends current studies by using a long-term workload that spans March 2012 to November 2014.
+> based on a long history, offer new valuable insights
+
+- The characteristics of backup workloads
+Backup workloads tend to have a high churn rate, a lot of stream locality, a high demand for writing, and high redundancy.
+
+
+
+### Long-term user-centric analysis
+
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
diff --git a/StoragePaperNote/NeXUS-DSN'19.md b/StoragePaperNote/NeXUS-DSN'19.md
old mode 100644
new mode 100755
index a217981..54f9695
--- a/StoragePaperNote/NeXUS-DSN'19.md
+++ b/StoragePaperNote/NeXUS-DSN'19.md
@@ -1,39 +1,39 @@
----
-typora-copy-images-to: ../paper_figure
----
-NEXUS: Practical and Secure Access Control on Untrusted Storage Platforms using Client-side SGX
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| UCC'19 | Data Encryption |
-[TOC]
-
-## 1. Summary
-
-### Motivation of this paper
-- Motivation
- - File sharing: allow user to store, share and synchronize files online.
- - Secure issues:
- - frequent data breaches
- - data confidentiality and integrity
- - **Purely cryptographic approaches** incur very high overheads on user revocation.
- - when decrypting files on a client machine, the encryption key is inevitably exposed to the client application and can be cached by the user. (need *re-encryption*)
- - Existing approaches
- - require server-side hardware support (limits their availability for users of personal cloud storage services)
- - should not rely on the service provider
-
-
-### NEXUS
-- Main Idea
- - Leverage Intel SGX to provide efficient access control and policy management
- - client-side cryptographic operations implemented inside an SGX enclave.
- - embeds user-specified access control policies into *files' cryptographically protected metadata*. (enforced by the enclave, smaller attached metadata)
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-
+---
+typora-copy-images-to: ../paper_figure
+---
+NEXUS: Practical and Secure Access Control on Untrusted Storage Platforms using Client-side SGX
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| UCC'19 | Data Encryption |
+[TOC]
+
+## 1. Summary
+
+### Motivation of this paper
+- Motivation
+  - File sharing: allows users to store, share, and synchronize files online.
+  - Security issues:
+ - frequent data breaches
+ - data confidentiality and integrity
+ - **Purely cryptographic approaches** incur very high overheads on user revocation.
+ - when decrypting files on a client machine, the encryption key is inevitably exposed to the client application and can be cached by the user. (need *re-encryption*)
+ - Existing approaches
+ - require server-side hardware support (limits their availability for users of personal cloud storage services)
+ - should not rely on the service provider
+
+
+### NEXUS
+- Main Idea
+ - Leverage Intel SGX to provide efficient access control and policy management
+ - client-side cryptographic operations implemented inside an SGX enclave.
+ - embeds user-specified access control policies into *files' cryptographically protected metadata*. (enforced by the enclave, smaller attached metadata)
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
diff --git a/StoragePaperNote/PRO-ORAM-RAID'19.md b/StoragePaperNote/PRO-ORAM-RAID'19.md
old mode 100644
new mode 100755
index 3a4086b..d0e6af7
--- a/StoragePaperNote/PRO-ORAM-RAID'19.md
+++ b/StoragePaperNote/PRO-ORAM-RAID'19.md
@@ -1,43 +1,43 @@
----
-typora-copy-images-to: ../paper_figure
----
-PRO-ORAM: Practical Read-Only Oblivious RAM
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| RAID'19 | Data Encryption |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Although encryption of data on the cloud guarantees data confidentiality, it is not sufficient to protect user privacy.
-> Access patterns on encrypted data leak substantial private information such **secret keys** and **user queries**.
-
-A solution to stop this inference is the use of Oblivious RAM (ORAM)
-> continuously shuffle the encrypted data blocks to avoid information leakage via the data access patterns.
-
-A large number of cloud-based storage services have a **read-only model** of data consumption.
-> offers only read operations after the initial upload (write) of the content to the cloud.
-> Dropbox
-
-- Key Question
-whether it is possible to achieve **constant latency** to hide read-only access?
-
-### PRO-ORAM
-- Main goals
-1. hide read data access patterns on the cloud server.
-2. achieve constant time to access each block from the cloud.
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. This paper proposes a practical and secure **read-only** ORAM design for cloud-based data hosting services.
-> utilizes sufficient computing units equipped with the SGX.
-
-2. It also provides the **security proof** and efficiency evaluation
->
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: ../paper_figure
+---
+PRO-ORAM: Practical Read-Only Oblivious RAM
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| RAID'19 | Data Encryption |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Although encryption of data on the cloud guarantees data confidentiality, it is not sufficient to protect user privacy.
+> Access patterns on encrypted data leak substantial private information such as **secret keys** and **user queries**.
+
+A solution to stop this inference is the use of Oblivious RAM (ORAM)
+> continuously shuffle the encrypted data blocks to avoid information leakage via the data access patterns.
+
+A large number of cloud-based storage services have a **read-only model** of data consumption.
+> offers only read operations after the initial upload (write) of the content to the cloud.
+> Dropbox
+
+- Key Question
+whether it is possible to achieve **constant latency** while hiding read-only access patterns?
+
+### PRO-ORAM
+- Main goals
+1. hide read data access patterns on the cloud server.
+2. achieve constant time to access each block from the cloud.
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. This paper proposes a practical and secure **read-only** ORAM design for cloud-based data hosting services.
+> utilizes sufficient computing units equipped with the SGX.
+
+2. It also provides the **security proof** and efficiency evaluation
+>
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/PerfectDedup-DataPrivacyManagement'15.md b/StoragePaperNote/PerfectDedup-DataPrivacyManagement'15.md
deleted file mode 100644
index 4da50fb..0000000
--- a/StoragePaperNote/PerfectDedup-DataPrivacyManagement'15.md
+++ /dev/null
@@ -1,41 +0,0 @@
----
-typora-copy-images-to: ../paper_figure
----
-PerfectDedup: Secure Data Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| Data Privacy Management, and Security Assurance'15 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Convergent encryption suffers from various well-known weakness including dictionary attacks
-
-Current countermeasures:
-> popular file: CE
-> unpopular file: random encryption
-
-This paper takes into account the the popularity of data segments
-> leverage the properties of Pefect Hashing in order to assure block-level deduplication and data confidentiality at the same time.
-
-The key challenge of this popularity-based approach
-> design of a secure mechanism to detect the popularity of data segments.
-> how to achieve no information leakage about the popularity of popular files and unpopular files
-
-The key motivation:
-> Popular data do not require the same level of protection as unpopular data and therefore propose different forms of encryption for popular and unpopular data.
-
-### PerfectDedup
-- Main idea
-
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-
diff --git a/StoragePaperNote/PraDa-CIKM'14.md b/StoragePaperNote/PraDa-CIKM'14.md
deleted file mode 100644
index 168939c..0000000
--- a/StoragePaperNote/PraDa-CIKM'14.md
+++ /dev/null
@@ -1,49 +0,0 @@
----
-typora-copy-images-to: ../paper_figure
----
-PraDa: Privacy-preserving Data Deduplication as a Service
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| CIKM'14 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper argues the data cleaning is a labor-intensive and complex process.
-> Is extremely challenging (cleaning for large scale data has grown difficult)
-> A solution is to outsource the data to a third-party data cleaning service provider. (cloud based data cleaning services)
-
-In this paper, it regards the data deduplication as the data cleaning problem, and investigates how to protect the privacy information when outsourcing the data to a third-party service provider.
-> the client encodes its data and sends the encoded data to the server.
-> the server is untrusted, may try to decode the received dataset to infer the private information.
-
-- Threat
-1. the attacker may have the frequency distribution information of the outsourced data (may launch frequency analysis)
-2. the attacker knows the details of the encoding scheme (may launch known-scheme attack)
-
-Add a **salt** can prevent frequency analysis attack but it may disable to find duplicated records form the encoded data.
-
-- The difference between this work and data deduplication
-In this work, it emphasizes on the data similarity of records
-> their similarity according to some distance measurement metrics is no less than a given threshold $\delta$ (near-duplicates)
-
-### PraDa
-- Goal
-It enables the client to outsource her data as well as data deduplication needs to a potentially untrusted server.
-> enables the server to discover near duplicated records from the encoded data
-
-- Locality-sensitive hashing based approach (LSHB)
-Idea: map the strings to locality-sensitive hashing (LSH) values.
-> LSH is a set of hash functions that map objects into several buckets such that similar objects share a bucket with high probability.
-> Any two similar objects will have the same LSH value with high probability.
-> **MinHash** function is the LSH function family.
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
diff --git a/StoragePaperNote/PrivacyAtScale-SIGMOD'18.md b/StoragePaperNote/PrivacyAtScale-SIGMOD'18.md
deleted file mode 100644
index f35e82e..0000000
--- a/StoragePaperNote/PrivacyAtScale-SIGMOD'18.md
+++ /dev/null
@@ -1,33 +0,0 @@
----
-typora-copy-images-to: ../paper_figure
----
-Privacy at Scale: Local Differential Privacy in Practice
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| SIGMOD'18 | Local Differential Privacy |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper shows an introducation to local differential privacy. It descirbes three practical realizations of LDP algorithms for **collecting popularity statistics**.
-1. RAPPOR from google: combines Randomized Response with Bloom Filters to compactly encode massive sets (CCS'14)
-
-2. Apple differential privacy: using Fourier transform to spread out signal information, and sketching techniques to reduce the dimensionality of the massive domain.
-> identifying differentially private heavy hitters
-> NIPS'17, STOC'15, arXiv'17
-
-3. Microsoft Telemetry collection: make use of histograms and fixed random numbers to collect data over time (NIPS'17)
-
-
-- LDP and DP
-1. DP
-In the standard (or centralized) setting, each user sends **raw data** $v$ to the aggregator, who obtains the true distribution, **add noise**, then publishes the result.
-> In this setting, the aggregator is trusted to not reveal the raw data and is trusted to handle the raw data correctly.
-
-2. LDP
-In this setting, each user perturbs the data **locally**, and thus does not have to trust the aggregator. LDP has a **stronger** privacy model than DP
-> however, entails greater noise.
-
-
-## 2. Future Works
diff --git a/StoragePaperNote/ProofsOwnership-CCS'11.md b/StoragePaperNote/ProofsOwnership-CCS'11.md
old mode 100644
new mode 100755
index 8159e82..0e02550
--- a/StoragePaperNote/ProofsOwnership-CCS'11.md
+++ b/StoragePaperNote/ProofsOwnership-CCS'11.md
@@ -1,53 +1,53 @@
----
-typora-copy-images-to: paper_figure
----
-Proofs of Ownership in Remote Storage Systems
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ACM CCS'11 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Client-side deduplication allows an attacker to gain access to arbitrary-size files of other users based on a very small hash signatures of these file.
-To overcome it, it introduces the notion of proofs-of-ownership (PoWs), which lets a client efficiently prove to a server that that the client holds a file, rather than just some short information about it.
-
-This paper only focuses on **file-level** deduplication.
-### Proofs of Ownership
-- A New Attack
-**Root cause**: By accepting the hash value as a "proxy" for the entire file, the server allows anyone who gets the hash value to get the entire file. (via asking the recovery from the storage service)
-> 1. by learning a small piece of information about the file, namely its hash value, an attacker can get the entire file from the server.
-> 2. a very short piece of information that represents the file.
-> 3. there are several kinds of attack based on this security vulnerability.
-
-- Proofs of Ownership (PoWs)
-**Goal**: design a solution where a client proves to the server that it indeed has the file.
-Requirements:
-> 1. **public** hash function (procedure must be public)
-> 2. the protocol run between the server and client must be **bandwidth efficient**
-> 3. Sever constraints: the solution must allow the server to store only ax extremely **short information** per file.
-> 4. Client constraints: the client can compute the proof by making **single pass** over the file.
-
-**Security Definition**:
-As long as the file has a lot of min-entropy, the attacker has only a small chance of convincing the server.
-
-- Solutions
-1. General solution: Erasure Coding + Merkle tree
-
-2. More efficient solution: using universal hashing
-
-
-
-
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. This work puts forward the notion of proof-of-ownership by which a client can prove to a server it has a copy of a file without actually sending it.
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+Proofs of Ownership in Remote Storage Systems
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ACM CCS'11 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Client-side deduplication allows an attacker to gain access to arbitrary-size files of other users based on a very small hash signature of those files.
+To overcome this, the paper introduces the notion of proofs of ownership (PoWs), which lets a client efficiently prove to a server that the client holds a file, rather than just some short information about it.
+
+This paper only focuses on **file-level** deduplication.
+### Proofs of Ownership
+- A New Attack
+**Root cause**: by accepting the hash value as a "proxy" for the entire file, the server allows anyone who gets the hash value to obtain the entire file (by requesting its recovery from the storage service).
+> 1. by learning a small piece of information about the file, namely its hash value, an attacker can get the entire file from the server.
+> 2. a very short piece of information that represents the file.
+> 3. there are several kinds of attack based on this security vulnerability.
+
+- Proofs of Ownership (PoWs)
+**Goal**: design a solution where a client proves to the server that it indeed has the file.
+Requirements:
+> 1. **public** hash function (procedure must be public)
+> 2. the protocol run between the server and client must be **bandwidth efficient**
+> 3. Server constraints: the solution must allow the server to store only an extremely **short** piece of information per file.
+> 4. Client constraints: the client must be able to compute the proof by making a **single pass** over the file.
+
+**Security Definition**:
+As long as the file has a lot of min-entropy, the attacker has only a small chance of convincing the server.
+
+- Solutions
+1. General solution: erasure coding + a Merkle tree (a minimal Merkle-proof sketch follows this list)
+
+2. More efficient solution: using universal hashing
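+
+The sketch below illustrates only the Merkle-tree challenge/response part of the general solution under simplifying assumptions (the erasure-coding preprocessing step and the real parameters are omitted; the block count is a power of two):
+
+```python
+import hashlib, os, random
+
+def h(x: bytes) -> bytes:
+    return hashlib.sha256(x).digest()
+
+def build_tree(blocks):
+    """Bottom-up Merkle tree; tree[-1][0] is the root the server keeps."""
+    level = [h(b) for b in blocks]
+    tree = [level]
+    while len(level) > 1:
+        level = [h(level[i] + level[i + 1]) for i in range(0, len(level), 2)]
+        tree.append(level)
+    return tree
+
+def prove(tree, idx):
+    """Client: return the sibling hashes along the path from leaf idx to the root."""
+    path = []
+    for level in tree[:-1]:
+        path.append(level[idx ^ 1])
+        idx //= 2
+    return path
+
+def verify(root, block, idx, path):
+    """Server: recompute the root from the challenged block and its path."""
+    node = h(block)
+    for sibling in path:
+        node = h(node + sibling) if idx % 2 == 0 else h(sibling + node)
+        idx //= 2
+    return node == root
+
+blocks = [os.urandom(64) for _ in range(8)]   # toy "file" split into 8 blocks
+tree = build_tree(blocks)
+root = tree[-1][0]                            # the only per-file state the server stores
+challenge = random.randrange(len(blocks))     # server challenges a random leaf
+assert verify(root, blocks[challenge], challenge, prove(tree, challenge))
+```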
+
+
+
+
+
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. This work puts forward the notion of proof-of-ownership by which a client can prove to a server it has a copy of a file without actually sending it.
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/SGXExplained-IACR'16.md b/StoragePaperNote/SGXExplained-IACR'16.md
old mode 100644
new mode 100755
index a8ae4e2..58117b9
--- a/StoragePaperNote/SGXExplained-IACR'16.md
+++ b/StoragePaperNote/SGXExplained-IACR'16.md
@@ -1,127 +1,128 @@
----
-typora-copy-images-to: ../paper_figure
----
-Intel SGX Explained
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| IACR'16 | SGX |
-[TOC]
-
-## 1. Overview
-- The root problem (confidentiality and integrity)
- - Secure remote computation: executing software on a remote computer **owned and maintained by an untrusted party**.
- - leveraging trusted hardware inn the remote computer
- - relies on **software attestation**
- - prove to a **user** that he is communicating with a specific piece of software running in a secure container hosted by the trusted hardware.
- - Proof is a signature that certifies the hash of the secure container's contents.
-
-- Enclave
- - Secure container
- - contains the private data in a computation
- - the code that operates on that data
-
-## 2. SGX programming model
-### 2.1 SGX Physical Memory Organization
-- Processor Reserved Memory (PRM)
- - PRM is a subset of DRAM that cannot be directly accessed by other software.
- - CPU's integrated memory controllers also reject DMA transfers targeting the PRM.
- - 
-
-- Enclave Page Cache (EPC)
- - Since a system can have multiple enclaves on a system at the same time, the EPC is split into 4KB pages that can be assigned to different enclaves.
- - the same page size as the architecture's address translation feature.
- - the EPC is managed by the same system software that manages the rest of the computer's physical memory
- - a hypervisor or an OS kernel uses **SGX instructions** to allocate unused pages
- - Most EPC pages are initialized by copying data from a non-PRM memory page.
-
-- Enclave Page Cache Map (EPCM)
- - As the system software is not trusted, SGX processors check the correctness of the system software's allocation decisions.
- - SGX records some information about the system software's allocation decisions for each EPC page in EPCM (store in "trusted memory")
- - 
- - page type:
- - PT_REG (regular type): enclave's code and data
- - PT_SECS (SGX Enclave Control Structures): store per-enclave metadata
-
-### 2.2 The memory layout of an SGX enclave
-Describe the interaction between enclaves and non-enclave software
-
-- The Enclave Linear Address Range (ELRANGE)
- - Each enclave designates an area in its **virtual address space**, call (ELRANGE)
- - map the code and sensitive data stored in the enclave's EPC pages
- - the virtual address space outside ELRANG is mapped to access non-EPC memory via the same virtual address.
- - 
- - when an enclave represents a **dynamic library**, it is natural to set ELRANGE to the memory range reserved for the library by the loader.
-
-## 3. Enclave Programming
-- Several terms need to be clarified
- - ECALL: "Enclave Call", a call made into an interface function within the enclave
- - OCALL: "Out Call", a call made from within the enclave to the outside application
- - Trusted Thread Context: the context for a thread running inside the enclave
- - Thread control structure (TCS)
- - Thread data/thread local storage: data within the enclave and specific to the thread.
- - State save area: a data buffer which holds register state when an enclave must exit due to an interrupt or exception.
- - Stack: a stack located within the enclave
-
-- General design approach
- - identify the application's secret
- - identify the providers and consumers of those secrets
- - determine the enclave boundary
- - tailor the application components for the enclave
-
-### 3.1 Enclave definition language (EDL)
-An enclave must expose an API for untrusted code to call in (ECALLs) and express what functions provided by the untrusted code are needed (OCALLs).
-> both ECALLs and OCALLs are defined by developers using EDL, they consist the enclave boundary.
-
-- EDL syntax
- - An enclave's bridge functions
- - ECALLs are prototyped in the trusted section, and OCALLs are prototyped in the untrusted section.
- - the *Edger8r* tool that is included with the Intel SGX SDK parses the EDL file and generates a series of **proxy functions**.
- - these proxy functions are wrappers around the real functions that are prototyped in the EDL.
- - Each ECALL and OCALL gets a pair of proxy functions (a trusted half and untrusted half)
- - trusted proxy functions: $EnclaveName$\_t.h and $EnclaveName$\_t.c
- - untrusted proxy functions: $EnclaveName$\_u.h and $EnclaveName$\_u.c
-```c++
-enclave {
- // Include files
- // Import other edl files
- // Data structure declarations to be used as parameters of the function prototypes in edl
-
- trusted {
- // Include file if any. It will be inserted in the trusted header file (enclave_t.h)
- // Trusted function prototypes (ECALLs)
- };
-
- untrusted {
- // Include file if any. It will be inserted in the untrusted header file (enclave_u.h)
- // Untrusted function prototypes (OCALLs)
- };
-};
-```
-
-- The proxy functions are responsible for:
- - Marshaling data into and out of the enclave
- - Placing the return value of the real ECALL or OCALL in an address referenced by a pointer parameter
- - Returning the success of failure of the ECALL and OCALL itself as an "sgx_status_t" value.
-
-
-
-- ECALL and OCALL processes
- - Program does not call the ECALL and OCALL function **directly**.
- - make an ECALL:
- - call the untrusted proxy function for the ECALL, which in turn calls the trusted proxy function inside the enclave.
- - that proxy then calls the "real" ECALL and the return value propagates back to the untrusted function.
-
-
-
-- The process to build a enclave program
- - write the Enclave Definition Language (EDL)
- - write the application that uses the enclave
- - define the global enclave id
- - As it is in the untrusted application, we must include "sgx_urts.h"
- - include: "sgx_create_enclave" function, officially initialize the enclave instance.
- - include: "sgx_destroy_enclave()" function, destroy the targeted enclave.
- - also include "Enclave_u.h", which will include all of the ECALL proxies generated from the EDL file after compilation
- - define the functions which should be put in enclave (e.g., in "Enclave.h")
- - put those functions in the trusted section of "Enclave.edl"
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Intel SGX Explained
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| IACR'16 | SGX |
+[TOC]
+
+## 1. Overview
+
+- The root problem (confidentiality and integrity)
+ - Secure remote computation: executing software on a remote computer **owned and maintained by an untrusted party**.
+ - leveraging trusted hardware in the remote computer
+ - relies on **software attestation**
+ - prove to a **user** that he is communicating with a specific piece of software running in a secure container hosted by the trusted hardware.
+ - Proof is a signature that certifies the hash of the secure container's contents.
+
+- Enclave
+ - Secure container
+ - contains the private data in a computation
+ - the code that operates on that data
+
+## 2. SGX programming model
+### 2.1 SGX Physical Memory Organization
+- Processor Reserved Memory (PRM)
+ - PRM is a subset of DRAM that cannot be directly accessed by other software.
+ - CPU's integrated memory controllers also reject DMA transfers targeting the PRM.
+ - 
+
+- Enclave Page Cache (EPC)
+ - Since a system can host multiple enclaves at the same time, the EPC is split into 4KB pages that can be assigned to different enclaves.
+ - the same page size as the architecture's address translation feature.
+ - the EPC is managed by the same system software that manages the rest of the computer's physical memory
+ - a hypervisor or an OS kernel uses **SGX instructions** to allocate unused pages
+ - Most EPC pages are initialized by copying data from a non-PRM memory page.
+
+- Enclave Page Cache Map (EPCM)
+ - As the system software is not trusted, SGX processors check the correctness of the system software's allocation decisions.
+ - SGX records some information about the system software's allocation decisions for each EPC page in the EPCM (stored in trusted memory)
+ - 
+ - page type:
+ - PT_REG (regular type): enclave's code and data
+ - PT_SECS (SGX Enclave Control Structures): store per-enclave metadata
+
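+A conceptual sketch of the per-page bookkeeping described above (valid flag, page type, owning enclave); the struct below is purely illustrative, since real EPCM entries are microarchitectural state that software cannot read, and the field names are invented here:
+
+```c++
+#include <cstdint>
+
+// Illustrative only: a simplified view of what SGX tracks for each EPC page.
+struct EpcmEntry {
+    bool     valid;        // is this EPC page currently allocated to an enclave?
+    uint8_t  page_type;    // e.g., PT_REG (enclave code/data) or PT_SECS (per-enclave metadata)
+    uint64_t owner_secs;   // identifies the enclave (via its SECS) that owns the page
+    uint64_t linear_addr;  // virtual address through which the enclave must access the page
+    bool     r, w, x;      // permissions checked on every enclave access to the page
+};
+```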
+### 2.2 The memory layout of an SGX enclave
+Describe the interaction between enclaves and non-enclave software
+
+- The Enclave Linear Address Range (ELRANGE)
+ - Each enclave designates an area in its **virtual address space**, called ELRANGE
+ - map the code and sensitive data stored in the enclave's EPC pages
+ - the virtual address space outside ELRANGE is mapped to access non-EPC memory via the same virtual addresses.
+ - 
+ - when an enclave represents a **dynamic library**, it is natural to set ELRANGE to the memory range reserved for the library by the loader.
+
+## 3. Enclave Programming
+- Several terms need to be clarified
+ - ECALL: "Enclave Call", a call made into an interface function within the enclave
+ - OCALL: "Out Call", a call made from within the enclave to the outside application
+ - Trusted Thread Context: the context for a thread running inside the enclave
+ - Thread control structure (TCS)
+ - Thread data/thread local storage: data within the enclave and specific to the thread.
+ - State save area: a data buffer which holds register state when an enclave must exit due to an interrupt or exception.
+ - Stack: a stack located within the enclave
+
+- General design approach
+ - identify the application's secret
+ - identify the providers and consumers of those secrets
+ - determine the enclave boundary
+ - tailor the application components for the enclave
+
+### 3.1 Enclave definition language (EDL)
+An enclave must expose an API for untrusted code to call in (ECALLs) and express what functions provided by the untrusted code are needed (OCALLs).
+> both ECALLs and OCALLs are defined by developers using EDL; together they constitute the enclave boundary.
+
+- EDL syntax
+ - An enclave's bridge functions
+ - ECALLs are prototyped in the trusted section, and OCALLs are prototyped in the untrusted section.
+ - the *Edger8r* tool that is included with the Intel SGX SDK parses the EDL file and generates a series of **proxy functions**.
+ - these proxy functions are wrappers around the real functions that are prototyped in the EDL.
+ - Each ECALL and OCALL gets a pair of proxy functions (a trusted half and untrusted half)
+ - trusted proxy functions: $EnclaveName$\_t.h and $EnclaveName$\_t.c
+ - untrusted proxy functions: $EnclaveName$\_u.h and $EnclaveName$\_u.c
+```c++
+enclave {
+ // Include files
+ // Import other edl files
+ // Data structure declarations to be used as parameters of the function prototypes in edl
+
+ trusted {
+ // Include file if any. It will be inserted in the trusted header file (enclave_t.h)
+ // Trusted function prototypes (ECALLs)
+ };
+
+ untrusted {
+ // Include file if any. It will be inserted in the untrusted header file (enclave_u.h)
+ // Untrusted function prototypes (OCALLs)
+ };
+};
+```
+
+- The proxy functions are responsible for:
+ - Marshaling data into and out of the enclave
+ - Placing the return value of the real ECALL or OCALL in an address referenced by a pointer parameter
+ - Returning the success or failure of the ECALL or OCALL itself as an "sgx_status_t" value.
+
+
+
+- ECALL and OCALL processes
+ - The program does not call the ECALL and OCALL functions **directly**.
+ - make an ECALL:
+ - call the untrusted proxy function for the ECALL, which in turn calls the trusted proxy function inside the enclave.
+ - that proxy then calls the "real" ECALL and the return value propagates back to the untrusted function.
+
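+A hypothetical example of this call chain, assuming an EDL that declares a trusted `ecall_sum` ECALL and an untrusted `ocall_print` OCALL (both names invented for illustration); the *Edger8r*-generated proxies follow the SDK convention of returning an `sgx_status_t` and taking the enclave id plus a pointer for the real return value:
+
+```c++
+// Untrusted side (declared in the generated Enclave_u.h):
+//   sgx_status_t ecall_sum(sgx_enclave_id_t eid, int* retval, int a, int b);
+// The application calls this proxy; it transitions into the enclave and runs the real ECALL.
+
+// Trusted side (Enclave.cpp): the "real" ECALL, which itself calls out of the
+// enclave through the trusted OCALL proxy declared in the generated Enclave_t.h.
+#include "Enclave_t.h"
+
+int ecall_sum(int a, int b) {
+    ocall_print("ecall_sum was invoked");  // OCALL: leaves the enclave via its proxy
+    return a + b;                          // marshaled back to the untrusted caller
+}
+```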
+
+
+- The process to build an enclave program (a minimal host-side sketch follows this list)
+ - write the Enclave Definition Language (EDL)
+ - write the application that uses the enclave
+ - define the global enclave id
+ - As it is in the untrusted application, we must include "sgx_urts.h"
+ - include: the "sgx_create_enclave" function, which officially initializes the enclave instance.
+ - include: the "sgx_destroy_enclave()" function, which destroys the targeted enclave.
+ - also include "Enclave_u.h", which will include all of the ECALL proxies generated from the EDL file after compilation
+ - define the functions which should be put in enclave (e.g., in "Enclave.h")
+ - put those functions in the trusted section of "Enclave.edl"
+
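+Putting the list above together, a minimal sketch of the untrusted host application; `sgx_create_enclave`/`sgx_destroy_enclave` come from `sgx_urts.h`, while the enclave file name and the `ecall_sum` proxy are illustrative placeholders:
+
+```c++
+#include <cstdio>
+#include "sgx_urts.h"    // untrusted runtime: sgx_create_enclave / sgx_destroy_enclave
+#include "Enclave_u.h"   // generated untrusted ECALL proxies (e.g., ecall_sum)
+
+int main() {
+    sgx_enclave_id_t eid = 0;
+    sgx_launch_token_t token = {0};
+    int token_updated = 0;
+
+    // Officially initialize the enclave instance from its signed image.
+    sgx_status_t ret = sgx_create_enclave("enclave.signed.so", /*debug=*/1,
+                                          &token, &token_updated, &eid, nullptr);
+    if (ret != SGX_SUCCESS) { std::printf("create failed: %#x\n", (unsigned)ret); return 1; }
+
+    // Call into the enclave through the generated untrusted proxy.
+    int sum = 0;
+    if (ecall_sum(eid, &sum, 2, 3) == SGX_SUCCESS) std::printf("sum = %d\n", sum);
+
+    sgx_destroy_enclave(eid);   // destroy the targeted enclave
+    return 0;
+}
+```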
diff --git a/StoragePaperNote/SecDep-MSST'15.md b/StoragePaperNote/SecDep-MSST'15.md
old mode 100644
new mode 100755
index 929d6e0..40937b8
--- a/StoragePaperNote/SecDep-MSST'15.md
+++ b/StoragePaperNote/SecDep-MSST'15.md
@@ -1,113 +1,113 @@
----
-typora-copy-images-to: ../paper_figure
----
-SecDep: A User-Aware Efficient Fine-Grained Secure Deduplication Scheme with Multi-Level Key Management
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| MSST'15 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Convergent encryption suffers brute-force attacks or incurs large computation overheads.
-> 1. deterministic and keyless issues.
-> 2. for chunk-level deduplication: each chunk performs the time-consuming RSA-OPRF protocol to generate message-locked key. (incur significant computation overhead)
-
-Existing key management approaches of convergent encryption incur
-> 1. large key space overhead
-> 2. single-point-of-failure
-
-This paper proposes the SecDeup which exploits redundant data distribution among and inside users, and use variants of CE to make a trade-off between **data security and duplicate detection performance**.
-
-**Key Observation**
-1. Cross-user redundant data are mainly from the duplicate files. Inside-user redundant data are mainly from the duplicate chunks.
-2. Cross-user level deduplication and inside-user deduplication schemes face different security challenges.
-> Cross-user level deduplication: high overhead.
-
-### SecDep
-- Key Idea
-Instead of using global chunk-level deduplication, SecDep combines cross-user file-level and inside-user chunk-level secure deduplication to eliminate more redundant data.
-> file-level deduplication: server-aided hash convergent encryption
-> chunk-level deduplication: user-aided convergent encryption
-> cross-user file-level and inside-user chunk-level deduplication
-
-- System Model
-
-
-
-- Threat Model
-Mainly focus on internal adversaries which can
-> 1. compromising the SP
-> 2. colluding with users
-> 3. stealing data from the DKS
-
-Security Goal
-> 1. data confidentiality: the data copies are secure to
-> 2. security of keys:
-
-- User-Aware Convergent Encryption (UACE)
-
-
-- Multi-Level Key Management
-It includes three kinds of keys: file-level keys, chunk-level keys, share-level keys.
-
-
-
-- Security analysis
-Ensure **data confidentiality** and **key security**. Consider two types of adversaries:
-> internal adversary (mainly focus on):
-> external adversary: could resist the external attacks by authentication.
-
-1. security of data
-> including data confidentiality and integrity
-
-2. security of keys
-> distribute keys in several key servers by using the known SSSS (secret sharing).
-
-3. security of SecDep
-**Best case**: the adversary only compromise the SP, but cannot access to DKS
-> the adversary cannot know the content of other users data even if it can perform brute-force attacks.
-
-**Semi-best case**: the adversary has compromisd, and can access to DKS
-> SecDep can ensure data security, the adversary cannot break the encryption key due to not knowing the user's secret.
-
-**Worst casr**: the adversary obtains some users' secrets
-> SecDep can still ensure security for unpredictable data that are not falling into a known set.
-
-### Implementation and Evaluation
-- Evaluation
-Compare with:
-> Baseline: without any security mechanisms.
-> DupLESS-chunk: chunk-level DupLESS
-> DupLESS-file: file-level DupLESS
-> SecDep
-
-Dataset:
-> 1. synthetic dataset:
-> 2. real-word datasets:
-
-1. Sensitivity study on size & number of files
-2. Sensitivity study on average chunk size
-3.
-
-## 2. Strength (Contributions of the paper)
-1. A modified version of CE: user-aware convergent encryption (UACE) approach to resist brute-force attack and reduce time overhead.
-2. Multi-level key management (MLK): ensure key security and reduce key space overheads.
-
-3. This paper considers the issue of the key space overhead
-> rarely to consider in other papers
-
-## 3. Weakness (Limitations of the paper)
-1. This paper intends to only eliminate duplicate files across different users, but it can only obtain the storage saving of majority of duplicate data.
-> incur the loss of dedup factor compared with (2.8-7.35%)
-
-
-## 4. Future Works
-1. This paper gives a good summary of state-of-art secure deduplication schemes.
-
-2. RSA-OPRF protocol is time-consuming
-> will incur huge computation and time overheads for chunk-level deduplication.
-> Large number of chunks incur hugh computation overheads
-
+---
+typora-copy-images-to: ../paper_figure
+---
+SecDep: A User-Aware Efficient Fine-Grained Secure Deduplication Scheme with Multi-Level Key Management
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| MSST'15 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Convergent encryption suffers from brute-force attacks or incurs large computation overheads.
+> 1. deterministic and keyless issues.
+> 2. for chunk-level deduplication: each chunk performs the time-consuming RSA-OPRF protocol to generate message-locked key. (incur significant computation overhead)
+
+Existing key management approaches of convergent encryption incur
+> 1. large key space overhead
+> 2. single-point-of-failure
+
+This paper proposes SecDep, which exploits the redundant data distribution among and inside users, and uses variants of CE to make a trade-off between **data security and duplicate detection performance**.
+
+**Key Observation**
+1. Cross-user redundant data are mainly from the duplicate files. Inside-user redundant data are mainly from the duplicate chunks.
+2. Cross-user level deduplication and inside-user deduplication schemes face different security challenges.
+> Cross-user level deduplication: high overhead.
+
+### SecDep
+- Key Idea
+Instead of using global chunk-level deduplication, SecDep combines cross-user file-level and inside-user chunk-level secure deduplication to eliminate more redundant data.
+> file-level deduplication: server-aided hash convergent encryption
+> chunk-level deduplication: user-aided convergent encryption
+> cross-user file-level and inside-user chunk-level deduplication
+
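+A simplified sketch of the two key-derivation paths, with HMAC-SHA256 (OpenSSL) standing in for the paper's constructions; in UACE proper the file-level key is obtained through a *server-aided* protocol so the key server never learns the file hash, whereas here a plain HMAC under a server secret is used, and all names are illustrative:
+
+```c++
+// Sketch only (not SecDep's exact protocol): file keys bind a server-side secret,
+// chunk keys bind a per-user secret, so cross-user file-level dedup still works
+// while chunk hashes cannot be brute-forced without the user's secret.
+#include <openssl/evp.h>
+#include <openssl/hmac.h>
+#include <openssl/sha.h>
+#include <vector>
+
+using Bytes = std::vector<unsigned char>;
+
+static Bytes hmac_sha256(const Bytes& key, const Bytes& msg) {
+    unsigned char md[EVP_MAX_MD_SIZE];
+    unsigned int len = 0;
+    HMAC(EVP_sha256(), key.data(), static_cast<int>(key.size()),
+         msg.data(), msg.size(), md, &len);
+    return Bytes(md, md + len);
+}
+
+static Bytes sha256(const Bytes& data) {
+    unsigned char md[SHA256_DIGEST_LENGTH];
+    SHA256(data.data(), data.size(), md);
+    return Bytes(md, md + SHA256_DIGEST_LENGTH);
+}
+
+// Cross-user file-level key: produced with the key server's help in SecDep
+// (server-aided), abbreviated here as an HMAC under a server secret.
+Bytes file_key(const Bytes& server_secret, const Bytes& file) {
+    return hmac_sha256(server_secret, sha256(file));
+}
+
+// Inside-user chunk-level key: derived locally from the user's own secret,
+// so only that user's duplicate chunks map to the same key.
+Bytes chunk_key(const Bytes& user_secret, const Bytes& chunk) {
+    return hmac_sha256(user_secret, sha256(chunk));
+}
+
+int main() {
+    const Bytes server_secret = {'s'}, user_secret = {'u'};
+    const Bytes data = {'d', 'a', 't', 'a'};
+    return (file_key(server_secret, data).size() == 32 &&
+            chunk_key(user_secret, data).size() == 32) ? 0 : 1;
+}
+```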
+- System Model
+
+
+
+- Threat Model
+Mainly focus on internal adversaries which can
+> 1. compromising the SP
+> 2. colluding with users
+> 3. stealing data from the DKS
+
+Security Goal
+> 1. data confidentiality: the data copies are secure to
+> 2. security of keys:
+
+- User-Aware Convergent Encryption (UACE)
+
+
+- Multi-Level Key Management
+It includes three kinds of keys: file-level keys, chunk-level keys, share-level keys.
+
+
+
+- Security analysis
+Ensure **data confidentiality** and **key security**. Consider two types of adversaries:
+> internal adversary (mainly focus on):
+> external adversary: could resist the external attacks by authentication.
+
+1. security of data
+> including data confidentiality and integrity
+
+2. security of keys
+> distribute keys in several key servers by using the known SSSS (secret sharing).
+
+3. security of SecDep
+**Best case**: the adversary only compromises the SP but cannot access the DKS
+> the adversary cannot learn the content of other users' data even if it can perform brute-force attacks.
+
+**Semi-best case**: the adversary has compromised the SP and can access the DKS
+> SecDep can still ensure data security; the adversary cannot break the encryption keys without knowing the user's secret.
+
+**Worst case**: the adversary obtains some users' secrets
+> SecDep can still ensure security for unpredictable data that are not falling into a known set.
+
+### Implementation and Evaluation
+- Evaluation
+Compare with:
+> Baseline: without any security mechanisms.
+> DupLESS-chunk: chunk-level DupLESS
+> DupLESS-file: file-level DupLESS
+> SecDep
+
+Dataset:
+> 1. synthetic dataset:
+> 2. real-world datasets:
+
+1. Sensitivity study on size & number of files
+2. Sensitivity study on average chunk size
+3.
+
+## 2. Strength (Contributions of the paper)
+1. A modified version of CE: user-aware convergent encryption (UACE) approach to resist brute-force attack and reduce time overhead.
+2. Multi-level key management (MLK): ensure key security and reduce key space overheads.
+
+3. This paper considers the issue of the key space overhead
+> rarely to consider in other papers
+
+## 3. Weakness (Limitations of the paper)
+1. This paper only eliminates duplicate files across different users, so it only obtains the storage savings of the majority of duplicate data, not all of it.
+> incurs a loss of dedup factor (2.8-7.35%)
+
+
+## 4. Future Works
+1. This paper gives a good summary of state-of-the-art secure deduplication schemes.
+
+2. RSA-OPRF protocol is time-consuming
+> will incur huge computation and time overheads for chunk-level deduplication.
+> a large number of chunks incurs huge computation overheads
+
 3. This paper suggests that cross-user redundant data are mainly from duplicate files.
\ No newline at end of file
diff --git a/StoragePaperNote/Security/CompareByHash-ATC'06.md b/StoragePaperNote/Security/CompareByHash-ATC'06.md
old mode 100644
new mode 100755
index 35fa40c..5f70f21
--- a/StoragePaperNote/Security/CompareByHash-ATC'06.md
+++ b/StoragePaperNote/Security/CompareByHash-ATC'06.md
@@ -1,67 +1,67 @@
----
-typora-copy-images-to: ../paper_figure
----
-Compare-by-Hash: A Reasoned Analysis
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ATC'06 | Hash |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper investigates the digest of a cryptographic hash function is equal on two distinct files.
-> instead of comparing them byte-by-byte.
-> can quickly check the files for equality by once again comparing only a few bytes rather than reading through them byte-by-byte.
-
-In rsync and LBFS, the goal of the designers was not to provide **security** via the use of cryptographic hash functions.
-
-This paper intends to against the work of Henson in HotOS'03, and suggests that it is certainly fine to use a 160-bit hash function like SHA1 with compare-by-hash.
-
-
-### Method Name
-- The basic of hash functions
-1. collision-resistant
-2. inversion-resistant
-3. second-preimage-resistant
-
-In the context of compare-by-hash, collision resistant is the main concern.
-> if it is given a list of $b$-bit independent random strings, it is expected to see the collision after about $2^{\frac{b}{2}}$. (**birthday bound**)
-> For example, for MD5, there should begin showing collisions after $2^{64}$ hash values.
-
-- The central theme in Henson's paper
-Her opposition to the notion that digests can be treated as unique ids for blocks of data.
-
-- Key property
-> Most file-system data are in fact **not uniform** over any domain.
-> SHA-1 does a very good job at mapping corelated inputs to uncorrelated outputs. (**differential cryptanalysis**).
-
-
-- The probability of collision
-
-- Attack model
-1. Bad luck attacker:
-The chance that two files will have the same hash is about $2^{-160}$ for a 160-bit hash function like SHA-1.
-
-2. An intelligent attacker
-try to cause collision, may substitute the blocks with same hash value.
-
-Compare-by-hash holds up well in both attack models.
-> In typical compare-by-hash setting there is no adversary
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. Two practical applications of compare-by-hash
-> rsync, Low-Bandwidth File System (LBFS)
-
-2. The security of SHA-1
-> this paper estimates it would cost 80,000,000 USD and 2 years to find a collision in SHA-1
-
-3. Cryptographic hash function in digitial signature
-> a document being digitally signed is first hashed with a cryptographic hash function, and the signature is applied to the hash value
-> applying a computationlly-expensive digital signature to a large file would be prohibitive.
+---
+typora-copy-images-to: ../paper_figure
+---
+Compare-by-Hash: A Reasoned Analysis
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ATC'06 | Hash |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper examines comparing two files for equality by checking whether their cryptographic hash digests are equal.
+> instead of comparing them byte-by-byte.
+> can quickly check the files for equality by once again comparing only a few bytes rather than reading through them byte-by-byte.
+
+In rsync and LBFS, the goal of the designers was not to provide **security** via the use of cryptographic hash functions.
+
+This paper argues against the work of Henson in HotOS'03, and suggests that it is certainly fine to use a 160-bit hash function like SHA-1 with compare-by-hash.
+
+
+### Method Name
+- The basics of hash functions
+1. collision-resistant
+2. inversion-resistant
+3. second-preimage-resistant
+
+In the context of compare-by-hash, collision resistance is the main concern.
+> given a list of independent random $b$-bit strings, a collision is expected after about $2^{\frac{b}{2}}$ of them (**birthday bound**).
+> For example, for MD5 ($b=128$), collisions are expected to appear after about $2^{64}$ hash values.
+
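+A short derivation of the birthday bound quoted above (standard reasoning, not taken from the paper): with $n$ independent, uniformly random $b$-bit digests, the probability of no collision is
+$$
+\Pr[\text{no collision}] = \prod_{i=1}^{n-1}\left(1-\frac{i}{2^b}\right) \approx e^{-\frac{n(n-1)}{2^{b+1}}}
+$$
+so collisions become likely once $n \approx 2^{b/2}$: about $2^{64}$ digests for MD5 and $2^{80}$ for SHA-1, while the chance that two *specific* files collide remains about $2^{-b}$.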
+- The central theme in Henson's paper
+Her opposition to the notion that digests can be treated as unique ids for blocks of data.
+
+- Key property
+> Most file-system data are in fact **not uniform** over any domain.
+> SHA-1 does a very good job at mapping correlated inputs to uncorrelated outputs. (**differential cryptanalysis**)
+
+
+- The probability of collision
+
+- Attack model
+1. Bad luck attacker:
+The chance that two files will have the same hash is about $2^{-160}$ for a 160-bit hash function like SHA-1.
+
+2. An intelligent attacker
+tries to cause collisions deliberately, e.g., by substituting blocks that have the same hash value.
+
+Compare-by-hash holds up well in both attack models.
+> in a typical compare-by-hash setting, there is no adversary
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. Two practical applications of compare-by-hash
+> rsync, Low-Bandwidth File System (LBFS)
+
+2. The security of SHA-1
+> this paper estimates it would cost 80,000,000 USD and 2 years to find a collision in SHA-1
+
+3. Cryptographic hash functions in digital signatures
+> a document being digitally signed is first hashed with a cryptographic hash function, and the signature is applied to the hash value
+> applying a computationally expensive digital signature to a large file directly would be prohibitive.
diff --git a/StoragePaperNote/Security/DeRef-TrustCom'11.md b/StoragePaperNote/Security/DeRef-TrustCom'11.md
old mode 100644
new mode 100755
index 86945b9..0531290
--- a/StoragePaperNote/Security/DeRef-TrustCom'11.md
+++ b/StoragePaperNote/Security/DeRef-TrustCom'11.md
@@ -1,99 +1,99 @@
----
-typora-copy-images-to: ../paper_figure
----
-A Privacy-Preserving Defense Mechanism Against Request Forgery Attacks
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| TrustCom'11 | Network Security |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-This paper intends to defend against a general class of cross-site and same-site request forgery attacks.
-> an attacker's website triggers a client's browser to send an HTTP request to a target website.
-> If the HTTP request carries the client credentials, then the attacker can perform actions on the website using the client's privileges, without the client be notified.
-
-- The limitation of current countermeasure
-A website can configure the scopes that are legitimate to initiate or receive sensitive requests that contain authentication credentials.
-> scopes: a combination of the protocol, domain, and path.
-
-the shortcoming of existing fine-grained access control approaches is *the policy file carries sensitive scope information* in **plain format**.
-
-### DeRef
-
-- Goal
-Not only protect a browser from revealing the URLs from which it initiates requests, but also protect a website from revealing how it configures the legitimate scope
-> the browser and website to exchange sensitive scope information while they may not need to fully trust each other.
-
-1. Detecting forged requests
-2. Fine-grained access control
-3. Privacy-preserving checking
-4. Feasible deployment
-
-
-- Key idea
-DeRef uses two-phase checking to make a trade-off between performance and privacy protection in real deployment.
-
-- Two-phase Privacy-Preserving checking (hash checking, blind checking)
-Allow the *browser* and *the website* to exchange information in a privacy-preserving manner.
-For example, suppose the website configures $L$ legitimate scopes in an ACL
-> denoted by $x_i$, where $i=1,2,...,L$.
-
-If the browser initiates a request to the website from URL $y$, then it checks if $y$ belongs to any of the $x_i$
-> the browser derives **all possible** scopes for a given URL $y$ into $y_1, y_2, ...,y_m$
-
-**Requirements**:
-1. the browser does not reveal $y$ to the website
-2. the browser does not know the $x_i$'s configured by the website, unless a scope of $y$ matches any of these.
-
-For (1):
-the website send the browser a list of $k$-bit hashes of the configured scopes,
-> $h(s,x_1)...h(s, x_L)$, $s$ is a random salt that is sent alongside the hash list.
-
-The browser also initiates a request from URL $y$. it computes $h(s, y_i), i \in [1, m]$ and checks if it matches any $h(s, x_i)$
-> does not reveal $y$ to the website.
-
-if $k$ is small, then the browser cannot surely tell if a $x_i$ is being configured.
-
-For (2):
-Use the potentially matched scopes returned by hash checking as inputs, and conduct blind checking
-> follow the blind-RSA, and send the blinded hash of $y_i$ to the website
-> the website signs and returns the hash of blinded hash
-
-- Rationale
-In blind checking, the browser needs to take a round trip to send every potentially matched scope to the website and have the website sign the scope.
-> high computation overhead.
-
-It introduces hash checking to ignore any scopes that are guaranteed to be not configured
-> reduce the overhead of blind checking.
-
-- Putting all things together
-1. Start-up: get the same base URL.
-2. Downloading the policy file
-3. Checking process: two-phase checking
-
-
-### Implementation and Evaluation
-- Evaluation
-1. Browsing insensitive webpage
-2. Browsing sensitive webpage
-3. Browsing malicious webpage
-4. Trade-off between performance and privacy
-> tune the parameter $k$.
-
-## 2. Strength (Contributions of the paper)
-1. propose a practical privacy-preserving approach to defending against cross-site and same-site request forgery attacks.
-> allow the browser and the website to exchange configuration information in a *privacy-preserving* manner.
-
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-1. In this paper, it implements a fine-grained access control mechanism by storing the policy-based information in the *Bloom filter*.
-
-2. In this paper, it needs to create privacy-preserving lists
-> The website should keep the ACLs private to browsers to avoid revealing its defense strategy.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+A Privacy-Preserving Defense Mechanism Against Request Forgery Attacks
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| TrustCom'11 | Network Security |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+This paper intends to defend against a general class of cross-site and same-site request forgery attacks.
+> an attacker's website triggers a client's browser to send an HTTP request to a target website.
+> If the HTTP request carries the client's credentials, then the attacker can perform actions on the website using the client's privileges, without the client being notified.
+
+- The limitation of current countermeasure
+A website can configure the scopes that are legitimate to initiate or receive sensitive requests that contain authentication credentials.
+> scopes: a combination of the protocol, domain, and path.
+
+The shortcoming of existing fine-grained access control approaches is that *the policy file carries sensitive scope information* in **plaintext**.
+
+### DeRef
+
+- Goal
+Not only to protect a browser from revealing the URLs from which it initiates requests, but also to protect a website from revealing how it configures its legitimate scopes
+> the browser and website to exchange sensitive scope information while they may not need to fully trust each other.
+
+1. Detecting forged requests
+2. Fine-grained access control
+3. Privacy-preserving checking
+4. Feasible deployment
+
+
+- Key idea
+DeRef uses two-phase checking to make a trade-off between performance and privacy protection in real deployment.
+
+- Two-phase Privacy-Preserving checking (hash checking, blind checking)
+Allow the *browser* and *the website* to exchange information in a privacy-preserving manner.
+For example, suppose the website configures $L$ legitimate scopes in an ACL
+> denoted by $x_i$, where $i=1,2,...,L$.
+
+If the browser initiates a request to the website from URL $y$, then it checks if $y$ belongs to any of the $x_i$
+> the browser derives **all possible** scopes for a given URL $y$ into $y_1, y_2, ...,y_m$
+
+**Requirements**:
+1. the browser does not reveal $y$ to the website
+2. the browser does not know the $x_i$'s configured by the website, unless a scope of $y$ matches any of these.
+
+For (1):
+the website sends the browser a list of $k$-bit hashes of the configured scopes,
+> $h(s,x_1)...h(s, x_L)$, $s$ is a random salt that is sent alongside the hash list.
+
+When the browser initiates a request from URL $y$, it computes $h(s, y_i)$ for $i \in [1, m]$ and checks whether any of them matches some $h(s, x_j)$
+> does not reveal $y$ to the website.
+
+If $k$ is small, hash collisions are likely, so the browser cannot tell for sure whether a given $x_i$ is actually configured.
+
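+A minimal sketch of this first (hash-checking) phase, assuming a $k$-bit truncation of a salted SHA-256 stands in for the paper's $k$-bit hash and using made-up scope strings; it only illustrates how a small $k$ yields candidate matches that must go on to blind checking:
+
+```c++
+#include <openssl/sha.h>
+#include <cstdint>
+#include <iostream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+// First k bits (k <= 32 here for simplicity) of SHA-256(salt || scope).
+static uint32_t salted_hash_k(const std::string& salt, const std::string& scope, int k) {
+    const std::string msg = salt + scope;
+    unsigned char md[SHA256_DIGEST_LENGTH];
+    SHA256(reinterpret_cast<const unsigned char*>(msg.data()), msg.size(), md);
+    uint32_t v = (static_cast<uint32_t>(md[0]) << 24) | (static_cast<uint32_t>(md[1]) << 16) |
+                 (static_cast<uint32_t>(md[2]) << 8)  |  static_cast<uint32_t>(md[3]);
+    return v >> (32 - k);
+}
+
+int main() {
+    const int k = 12;                        // small k: collisions hide the exact ACL
+    const std::string salt = "random-salt";  // sent alongside the hash list
+
+    // Website side: k-bit hashes of its configured scopes x_1..x_L.
+    const std::vector<std::string> acl = {"https://bank.example/pay",
+                                          "https://bank.example/transfer"};
+    std::unordered_set<uint32_t> published;
+    for (const auto& x : acl) published.insert(salted_hash_k(salt, x, k));
+
+    // Browser side: all candidate scopes y_1..y_m of the requesting URL; only the
+    // potentially matching ones proceed to the (more expensive) blind checking.
+    const std::vector<std::string> candidates = {"https://bank.example/pay",
+                                                 "https://bank.example",
+                                                 "https://bank.example/blog"};
+    for (const auto& y : candidates)
+        if (published.count(salted_hash_k(salt, y, k)))
+            std::cout << "potential match (send to blind checking): " << y << "\n";
+    return 0;
+}
+```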
+For (2):
+Use the potentially matched scopes returned by hash checking as inputs, and conduct blind checking
+> following blind RSA, the browser sends a blinded hash of $y_i$ to the website
+> the website signs the blinded value and returns it; the browser unblinds the signature
+
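+For reference, the blind-RSA step works as follows (textbook blind signatures, included only to make the flow concrete; $(e, N)$/$d$ are the website's RSA public/private keys and $r$ is a random blinding factor chosen by the browser):
+$$
+b_i = H(y_i) \cdot r^e \bmod N, \qquad \sigma' = b_i^{\,d} \bmod N, \qquad \sigma = \sigma' \cdot r^{-1} \bmod N = H(y_i)^d \bmod N
+$$
+The website only ever sees the blinded value $b_i$, so it learns nothing about $y_i$, while the browser still obtains the website's signature on $H(y_i)$ for the final comparison.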
+- Rationale
+In blind checking, the browser needs to take a round trip to send every potentially matched scope to the website and have the website sign the scope.
+> high computation overhead.
+
+It introduces hash checking to ignore any scopes that are guaranteed to be not configured
+> reduce the overhead of blind checking.
+
+- Putting all things together
+1. Start-up: get the same base URL.
+2. Downloading the policy file
+3. Checking process: two-phase checking
+
+
+### Implementation and Evaluation
+- Evaluation
+1. Browsing insensitive webpage
+2. Browsing sensitive webpage
+3. Browsing malicious webpage
+4. Trade-off between performance and privacy
+> tune the parameter $k$.
+
+## 2. Strength (Contributions of the paper)
+1. propose a practical privacy-preserving approach to defending against cross-site and same-site request forgery attacks.
+> allow the browser and the website to exchange configuration information in a *privacy-preserving* manner.
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. In this paper, it implements a fine-grained access control mechanism by storing the policy-based information in the *Bloom filter*.
+
+2. In this paper, it needs to create privacy-preserving lists
+> The website should keep the ACLs private to browsers to avoid revealing its defense strategy.
+
 3. In its two-phase checking, it uses hash checking to reduce the overhead of blind checking; this idea can be reused in other settings.
\ No newline at end of file
diff --git a/StoragePaperNote/Security/DifferentialPrivacy-INFOCOM'18.md b/StoragePaperNote/Security/DifferentialPrivacy-INFOCOM'18.md
old mode 100644
new mode 100755
index d5bf6c6..5a87a0c
--- a/StoragePaperNote/Security/DifferentialPrivacy-INFOCOM'18.md
+++ b/StoragePaperNote/Security/DifferentialPrivacy-INFOCOM'18.md
@@ -1,80 +1,80 @@
----
-typora-copy-images-to: paper_figure
----
-Differentially Private Access Patterns for Searchable Symmetric Encryption
-------------------------------------------
-| Venue | Category |
-| :--------: | :------------------: |
-| INFOCOM'18 | Differential Privacy |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper proposes to protect searchable symmetric encryption (SSE) against access-pattern leakage with a form of access-pattern obfuscation (APO).
-It is a framework which can built over any given SSE scheme.
-
-A basic workflow of SEE:
-
-
-This work aims to defend query recovery attacks:
-The goal of a query recovery attack is to recover the content of a query (i.e., the plaintext keyword) issued by the client to server.
-
-### Access-Pattern Obfuscation
-Design goal: provide an access-pattern obfuscation mechanism that is compatible with existing SSE schemes.
-> add false positives and false negatives to the search results.
-> **false positive**: the server returns some documents that do not match the query (fake documents).
-> **false negatives**: the server does not return some documents that match the query (violate the correctness of the SEE scheme)
-
-- How to handle correctness issue of false negatives
-This paper introduce redundancy to the document to collection using **erasure codes**.
-> each document $\rightarrow$ multiple shards
-> the collection of all shards is encrypted and outsourced to the remote server.
-
-It argues allowing a probability that some matching documents may not be returned is essential to archive a provable privacy guarantee such differential privacy, and a fairly high recall rate, say 99.99%, can be useful in practice already.
-
-- Setup Phase
-1. For each document $D$, it extracts its keyword list $W$.
-2. Applying erasure coding to generate $m$ shares of $D$, append the keyword list for each share.
-3. For each share and its keyword list, adopt the access-pattern obfuscation mechanism
-> some shares of the matching documents will not be returned in response to a query
-
-- Search Phase
-1. Given a search keyword $w$, the resulting search token is sent to the server.
-2. The received shares are decrypted.
-
-- $d$-Private Access-Pattern Obfuscation
-The goal of this privacy definition is to guarantee that for access patterns that are **similar**, the obfuscated access patterns generated from them are **indistinguishable**.
-> $x=111000$, $x^{'}=110100$, obfuscated access pattern $y=110001$ is observed with high probability, the adversary cannot tell whether $x$ or $x^{'}$ was the original access pattern.
-
-An access-pattern obfuscation mechanism probabilistically convert an access-pattern vector to another access-pattern vector.
-> the intuition of $d$-privacy is that similar access patterns generate similar obfuscated access patterns.
-> So from an obfuscated access pattern, it is different to infer its original access pattern.
-
-
-
-
-### Implementation and Evaluation
-- Security Evaluation
-1. The baseline IKK attack
-> the adversary's unawareness of the existence of mitigation methods.
-
-2. The improved IKK attack
-> further assume the adversary knows the existence of its defenses.
-> This is possible when the shards of different original documents are **different in size**. (This point is very important)
-
-- Performance Evaluation
-1. Storage and Communication Overhead
-2. Precision
-3. Runtime Overhead: access latency
-
-## 2. Strength (Contributions of the paper)
-1. It proposes d-privacy for access pattern of general SSE schemes
-2. It designs a d-private access-pattern obfuscation mechanism which is compatible with existing SSE schemes
-3. It also implement a prototype of the proposed obfuscation mechanism.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. This paper provides a good insight on how to leverage erasure coding to achieve differential privacy, and this can be used in other scenarios. How to use in secure deduplication?
-
+---
+typora-copy-images-to: paper_figure
+---
+Differentially Private Access Patterns for Searchable Symmetric Encryption
+------------------------------------------
+| Venue | Category |
+| :--------: | :------------------: |
+| INFOCOM'18 | Differential Privacy |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper proposes to protect searchable symmetric encryption (SSE) against access-pattern leakage with a form of access-pattern obfuscation (APO).
+It is a framework which can be built on top of any given SSE scheme.
+
+A basic workflow of SSE:
+
+
+This work aims to defend against query recovery attacks:
+The goal of a query recovery attack is to recover the content of a query (i.e., the plaintext keyword) issued by the client to server.
+
+### Access-Pattern Obfuscation
+Design goal: provide an access-pattern obfuscation mechanism that is compatible with existing SSE schemes.
+> add false positives and false negatives to the search results.
+> **false positive**: the server returns some documents that do not match the query (fake documents).
+> **false negatives**: the server does not return some documents that match the query (violating the correctness of the SSE scheme)
+
+- How to handle the correctness issue of false negatives
+This paper introduces redundancy into the document collection using **erasure codes**.
+> each document $\rightarrow$ multiple shards
+> the collection of all shards is encrypted and outsourced to the remote server.
+
+It argues that allowing a probability that some matching documents may not be returned is essential to achieve a provable privacy guarantee such as differential privacy, and that a fairly high recall rate, say 99.99%, is already useful in practice.
+
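+A back-of-the-envelope view of why the erasure-coded redundancy keeps recall high (an illustration, not a calculation from the paper): if each document is encoded into $m$ shards such that any $k$ of them suffice to rebuild it, and each matching shard is independently returned with probability $p$ after obfuscation, then
+$$
+\Pr[\text{document recoverable}] = \sum_{i=k}^{m} \binom{m}{i} p^{i}(1-p)^{m-i}
+$$
+which can be driven arbitrarily close to 1 by adding redundancy (larger $m-k$), even though individual shards are dropped as false negatives.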
+- Setup Phase
+1. For each document $D$, it extracts its keyword list $W$.
+2. Applying erasure coding to generate $m$ shares of $D$, append the keyword list for each share.
+3. For each share and its keyword list, adopt the access-pattern obfuscation mechanism
+> some shares of the matching documents will not be returned in response to a query
+
+- Search Phase
+1. Given a search keyword $w$, the resulting search token is sent to the server.
+2. The received shares are decrypted.
+
+- $d$-Private Access-Pattern Obfuscation
+The goal of this privacy definition is to guarantee that for access patterns that are **similar**, the obfuscated access patterns generated from them are **indistinguishable**.
+> e.g., for $x=111000$ and $x^{'}=110100$, an obfuscated access pattern $y=110001$ is observed with similar probability under both, so the adversary cannot tell whether $x$ or $x^{'}$ was the original access pattern.
+
+An access-pattern obfuscation mechanism probabilistically converts an access-pattern vector into another access-pattern vector.
+> the intuition of $d$-privacy is that similar access patterns generate similarly distributed obfuscated access patterns.
+> So from an obfuscated access pattern, it is difficult to infer its original access pattern.
+
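+A toy sketch of such a mechanism, assuming a simple per-bit randomized response: a matching position stays 1 with probability $p$ (otherwise a false negative) and a non-matching position becomes 1 with probability $q$ (a false positive); the paper calibrates these probabilities to satisfy the $d$-privacy bound, which this sketch does not attempt:
+
+```c++
+// Toy access-pattern obfuscation: each '1' (matching shard) survives with prob. p,
+// each '0' flips to '1' with prob. q. The probabilities are placeholders only.
+#include <cstdio>
+#include <random>
+#include <vector>
+
+std::vector<int> obfuscate(const std::vector<int>& pattern, double p, double q,
+                           std::mt19937& rng) {
+    std::bernoulli_distribution keep(p), inject(q);
+    std::vector<int> out(pattern.size());
+    for (size_t i = 0; i < pattern.size(); ++i)
+        out[i] = pattern[i] ? (keep(rng) ? 1 : 0)    // possible false negative
+                            : (inject(rng) ? 1 : 0); // possible false positive
+    return out;
+}
+
+int main() {
+    std::mt19937 rng(42);
+    const std::vector<int> x = {1, 1, 1, 0, 0, 0};   // true access pattern
+    const std::vector<int> y = obfuscate(x, /*p=*/0.999, /*q=*/0.05, rng);
+    for (int b : y) std::printf("%d", b);
+    std::printf("\n");
+    return 0;
+}
+```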
+
+
+
+### Implementation and Evaluation
+- Security Evaluation
+1. The baseline IKK attack
+> the adversary's unawareness of the existence of mitigation methods.
+
+2. The improved IKK attack
+> further assume the adversary knows the existence of its defenses.
+> This is possible when the shards of different original documents are **different in size**. (This point is very important)
+
+- Performance Evaluation
+1. Storage and Communication Overhead
+2. Precision
+3. Runtime Overhead: access latency
+
+## 2. Strength (Contributions of the paper)
+1. It proposes d-privacy for access pattern of general SSE schemes
+2. It designs a d-private access-pattern obfuscation mechanism which is compatible with existing SSE schemes
+3. It also implements a prototype of the proposed obfuscation mechanism.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. This paper provides a good insight on how to leverage erasure coding to achieve differential privacy, and this can be used in other scenarios. How to use in secure deduplication?
+
> Mainly used to hide access-pattern
\ No newline at end of file
diff --git a/StoragePaperNote/Security/FADE-TDSC'12.md b/StoragePaperNote/Security/FADE-TDSC'12.md
old mode 100644
new mode 100755
index de19372..6252a01
--- a/StoragePaperNote/Security/FADE-TDSC'12.md
+++ b/StoragePaperNote/Security/FADE-TDSC'12.md
@@ -1,89 +1,89 @@
----
-typora-copy-images-to: ../paper_figure
----
-Secure Overlay Cloud Storage with Access Control and Assured Deletion
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| TDSC'12 | assured deletion |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-1. **Access control**: ensure only authorized parties can access the outsourced data on the cloud.
-2. **Assured deletion**: outsourced data is permanently inaccessible to anybody (including the data owner) upon requests of deletion of data.
-> achieving assured deletion is that it has to trust cloud storage providers to actually delete data, but they may be reluctant in doing so.
-
-
-Need to build a system that can enforce access control and assured deletion of outsourced data on the cloud in a *fine-grained manner*.
-
-### FADE
-
-- General policy-based deletion
-generalize time-based deletion to policy-based deletion
-> associate each file with a single atomic file access policy (**Boolean combinatrion of atomic policies**)
-> each policy has a control key
-
-- FADE overview
-1. Key
-> Data key: for AES encryption for data content
-> Control key: a public-private key pair, use to *encrypt/decrypt* the data keys (maintained by the quorum of key managers)
-> Access key: the private access is maintained by the FADE client (ABE encryption)
-
-2. Security goals (for active files and deleted files)
-> policy-based access control
-> policy-based assured deletion
-
-3. Entities
-FADE client, Key managers, cloud provider (thin-cloud interfaces)
-
-
-- Extensions of FADE
-1. Add the access control with ABE
-Using **attribute-based encryption (ABE)** to achieve access control
-> the client needs to satisfy the policy combination
-> the key manager leverages the **public access key** to encrypt the response messages returned to the clients
-
-In this extended version, FADE now uses two independent keys for each policy. (access control and assured deletion)
-
-2. Multiple Key Managers
-To avoid the single-point-of-failure problem, it uses Shamir's scheme
-> M < N
-> when deletion, it needs to remove (N-M+1) private control keys corresponding to the policy.
-
-
-- Security analysis
-1. Active files:
-An active file is encrypted with a data key (can only be decrypted by key manager).
-The response from key manager is protected by ABE-base access key.
-
-2. Deleted files:
-The key manager has purged the *control key* for the revoked policy permanently, the adversary loses the ability to decrypt the data key.
-
-
-### Implementation and Evaluation
-- Implementation
-1. using OpenSSL for cryptograpghic operations
-2. using *cpabe* library for the ABE-based access control
-3. *ssss* library for secret sharing
-4. *LibAWS++* for interfacing with Amazon S3 uisng plain HTTP
-5. Each file has a own metadata
-> contains the specification of the Boolean combination of policies.
-> the corresponding cryptographic keys (encrypted data key).
-> the control keys associated with the policies.
-
-- Evaluation (performance overhead and monetary overhead of FADE)
-1. file transmission time, metadata transmission time, cryptographic operation time
-
-
-## 2. Strength (Contributions of the paper)
-1. propose a general policy-based file assured deletion scheme, and fault-tolerant key management
-2. implement the prototype over Amazon S3
-
-## 3. Weakness (Limitations of the paper)
-1. it is not clear how to implement policy in the prototype implementation
-
-## 4. Some Insights (Future work)
-1. In this paper, it introduces the idea of assured deletion which is using a control key to encrypt data key
-> this idea can be used in other scenario when we consider assured deletion.
+---
+typora-copy-images-to: ../paper_figure
+---
+Secure Overlay Cloud Storage with Access Control and Assured Deletion
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| TDSC'12 | assured deletion |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+1. **Access control**: ensure only authorized parties can access the outsourced data on the cloud.
+2. **Assured deletion**: outsourced data is permanently inaccessible to anybody (including the data owner) upon requests of deletion of data.
+> the difficulty of achieving assured deletion is that one has to trust cloud storage providers to actually delete data, but they may be reluctant to do so.
+
+
+Need to build a system that can enforce access control and assured deletion of outsourced data on the cloud in a *fine-grained manner*.
+
+### FADE
+
+- General policy-based deletion
+generalize time-based deletion to policy-based deletion
+> associate each file with a file access policy, either a single atomic policy or a **Boolean combination of atomic policies**
+> each policy has a control key
+
+- FADE overview
+1. Key
+> Data key: for AES encryption for data content
+> Control key: a public-private key pair, use to *encrypt/decrypt* the data keys (maintained by the quorum of key managers)
+> Access key: the private access key is maintained by the FADE client (ABE encryption)
+
+2. Security goals (for active files and deleted files)
+> policy-based access control
+> policy-based assured deletion
+
+3. Entities
+FADE client, Key managers, cloud provider (thin-cloud interfaces)
+
+
+- Extensions of FADE
+1. Add the access control with ABE
+Using **attribute-based encryption (ABE)** to achieve access control
+> the client needs to satisfy the policy combination
+> the key manager leverages the **public access key** to encrypt the response messages returned to the clients
+
+In this extended version, FADE now uses two independent keys for each policy. (access control and assured deletion)
+
+2. Multiple Key Managers
+To avoid the single-point-of-failure problem, it uses Shamir's scheme
+> an $(M, N)$ threshold scheme with $M < N$
+> upon deletion, it needs to remove $N-M+1$ private control keys corresponding to the policy, so that at most $M-1$ shares remain recoverable (see the derivation below)
+
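+For reference, the standard $(M, N)$ Shamir scheme behind this (textbook material, not specific to FADE): a secret $K$ is embedded as the constant term of a random degree-$(M-1)$ polynomial over a finite field, and key manager $i$ receives the share $s_i = f(i)$:
+$$
+f(x) = K + a_1 x + \cdots + a_{M-1}x^{M-1}, \qquad s_i = f(i), \; i = 1, \dots, N
+$$
+Any $M$ shares (an index set $S$ with $|S| = M$) recover $K$ by Lagrange interpolation at $0$, while $M-1$ or fewer shares reveal nothing about $K$:
+$$
+K = f(0) = \sum_{i \in S} s_i \prod_{j \in S,\, j \neq i} \frac{j}{j - i}
+$$
+Hence purging $N-M+1$ shares leaves only $M-1$, which makes $K$, and therefore the data key it protects, unrecoverable.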
+
+- Security analysis
+1. Active files:
+An active file is encrypted with a data key (can only be decrypted by key manager).
+The response from the key manager is protected by the ABE-based access key.
+
+2. Deleted files:
+The key manager has purged the *control key* for the revoked policy permanently, the adversary loses the ability to decrypt the data key.
+
+
+### Implementation and Evaluation
+- Implementation
+1. using OpenSSL for cryptographic operations
+2. using the *cpabe* library for the ABE-based access control
+3. the *ssss* library for secret sharing
+4. *LibAWS++* for interfacing with Amazon S3 using plain HTTP
+5. Each file has its own metadata
+> contains the specification of the Boolean combination of policies.
+> the corresponding cryptographic keys (encrypted data key).
+> the control keys associated with the policies.
+
+- Evaluation (performance overhead and monetary overhead of FADE)
+1. file transmission time, metadata transmission time, cryptographic operation time
+
+
+## 2. Strength (Contributions of the paper)
+1. propose a general policy-based file assured deletion scheme, and fault-tolerant key management
+2. implement the prototype over Amazon S3
+
+## 3. Weakness (Limitations of the paper)
+1. it is not clear how to implement policy in the prototype implementation
+
+## 4. Some Insights (Future work)
+1. In this paper, it introduces the idea of assured deletion which is using a control key to encrypt data key
+> this idea can be used in other scenarios where we consider assured deletion.
diff --git a/StoragePaperNote/Security/FrequencyHiding-CCS'15.md b/StoragePaperNote/Security/FrequencyHiding-CCS'15.md
old mode 100644
new mode 100755
index 00d1c5b..ed870d3
--- a/StoragePaperNote/Security/FrequencyHiding-CCS'15.md
+++ b/StoragePaperNote/Security/FrequencyHiding-CCS'15.md
@@ -1,41 +1,41 @@
----
-typora-copy-images-to: paper_figure
----
-Frequency-Hiding Order-Preserving Encryption
-------------------------------------------
-| Venue | Category |
-| :----: | :--------------------------------------------------: |
-| CCS'15 | Frequency Encryption, Property Preserving Encryption |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper wants to do a new trade-off. It can clearly increase security while preserving the functionality for most queries relying on the order information.
-
-It needs to increase **client storage size** and introduce **a small error in some queries**.
-
-- Application of order-preserving encryption
-> Order-preserving encryption enables to perform range queries over an encrypted database without any changes to the database management system.
-
-**Security notion**: indistinguishability under *frequency-analyzing*
-
-### Frequency-Hiding Order-preseving Encryption
-1. Main idea: present an order-preserving encryption scheme that is randomized.
-> Repeated plaintexts will (or can) become different ciphertexts.
-
-2. It also employs a number of data compression techniques to reduce the amount of information stored on the client.
-> make this scheme somewhat practical while still improving the security of ciphertexts.
-
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. This paper mentions that if the ciphertexts can approximate a uniform distribution, it would improve against frequency-analyzing attacks.
-
-2. Some related notions in this area
+---
+typora-copy-images-to: paper_figure
+---
+Frequency-Hiding Order-Preserving Encryption
+------------------------------------------
+| Venue | Category |
+| :----: | :--------------------------------------------------: |
+| CCS'15 | Frequency Encryption, Property Preserving Encryption |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper makes a new trade-off: it clearly increases security while preserving the functionality of most queries that rely on order information.
+
+The cost is an increased **client storage size** and **a small error in some queries**.
+
+- Application of order-preserving encryption
+> Order-preserving encryption enables performing range queries over an encrypted database without any changes to the database management system.
+
+**Security notion**: indistinguishability under *frequency analysis*
+
+### Frequency-Hiding Order-Preserving Encryption
+1. Main idea: present an order-preserving encryption scheme that is randomized (see the sketch after this list).
+> Repeated plaintexts will (or can) become different ciphertexts.
+
+2. It also employs a number of data compression techniques to reduce the amount of information stored on the client.
+> make this scheme somewhat practical while still improving the security of ciphertexts.
+
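+A minimal sketch of the randomization idea referenced in item 1 (not the paper's actual construction, which keeps a compressed search-tree state on the client): the client remembers, per plaintext, the range of ciphertexts handed out so far and draws each new ciphertext uniformly from the remaining gap, so repeated plaintexts get distinct but still order-consistent ciphertexts:
+
+```c++
+// Toy randomized order-preserving encoding (illustration only; the "key" is the
+// client-side state itself, and enc() is stateful and non-deterministic).
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <map>
+#include <random>
+#include <utility>
+
+class ToyRandomizedOpe {
+public:
+    uint64_t enc(int plaintext) {
+        // Allowed gap: above every ciphertext of plaintexts <= p (simplification),
+        // below every ciphertext of plaintexts > p.
+        uint64_t lo = 0, hi = UINT64_MAX;
+        auto it = range_.upper_bound(plaintext);        // first plaintext > p
+        if (it != range_.end()) hi = it->second.first;  // its smallest ciphertext
+        if (it != range_.begin()) { --it; lo = it->second.second; }  // largest ct of <= p
+        std::uniform_int_distribution<uint64_t> pick(lo + 1, hi - 1);
+        const uint64_t ct = pick(rng_);
+        auto& r = range_.emplace(plaintext, std::make_pair(ct, ct)).first->second;
+        r.first = std::min(r.first, ct);
+        r.second = std::max(r.second, ct);
+        return ct;  // repeated plaintexts get different, order-consistent ciphertexts
+    }
+private:
+    std::map<int, std::pair<uint64_t, uint64_t>> range_;  // plaintext -> (min ct, max ct)
+    std::mt19937_64 rng_{1234};
+};
+
+int main() {
+    ToyRandomizedOpe ope;
+    std::printf("%llu %llu %llu\n",
+                (unsigned long long)ope.enc(5),
+                (unsigned long long)ope.enc(5),    // same plaintext, different ciphertext
+                (unsigned long long)ope.enc(7));   // larger plaintext, larger ciphertext
+    return 0;
+}
+```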
+
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. This paper mentions that if the ciphertexts can approximate a uniform distribution, resistance against frequency-analysis attacks would improve.
+
+2. Some related notions in this area
> Searchable encryption, homomorphic encryption, functional encryption
\ No newline at end of file
diff --git a/StoragePaperNote/Security/FrequencySmoothing-ICAR'17.md b/StoragePaperNote/Security/FrequencySmoothing-ICAR'17.md
old mode 100644
new mode 100755
index de8a2d6..8801f1e
--- a/StoragePaperNote/Security/FrequencySmoothing-ICAR'17.md
+++ b/StoragePaperNote/Security/FrequencySmoothing-ICAR'17.md
@@ -1,174 +1,174 @@
----
-typora-copy-images-to: ../paper_figure
----
-Frequency-smoothing Encryption: Preventing Snapshot Attacks on Deterministically Encrypted Data
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ICAR'17 | Encryption |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper intends to combat inference attacks based on classical frequency analysis by using homophonic encoding.
-> develop the concept of frequency-smoothing encryption (FSE)
-> prevent inference attacks in the snapshot attack model
-
-- Frequency analysis
-If the plaintext distribution is not uniform and an adversary has a reference dataset from which it can compute expected plaintext frequencies
-> then an adversary with access to a snapshot of the encrypted data
-> can try to match frequencies in the encrypted domain with those in the plaintext domain
-
-### Frequency-smoothing encryption (FSE)
-- Homophonic encoding
-Goal: flatten the frequency distribution of messages by mapping each plaintext to multiple possible homophones.
-> the number of encodings for each plaintext $m$ ideally being proportional to the frequency of $m$
-> this method wants to ensure that each message has enough homophones to combat frequency analysis (but not so many that they cannot all be computed on the fly)
-
-- Threat Model
-This paper achieves security against only two forms of attack:
-1. the standard security notion for DE
-2. against inference attacks made by a snapshot attacker
-
-
-- Algorithm
-Design goal: outputs ciphertext whose frequencies are uniform.
-> even an adversary who knows the underlying plaintext frequencies cannot infer anything about the data.
-
-Difference Measurement $\Delta$ (distribution adaptation parameter):
-Use the upper bound on the **Kolmogorov-Smirnov** statistic of the two distributions.
-
-> statistical distance
-> this parameter indicates how much uncertainty is associated with the initial estimated distribution $\hat{D}$.
-> If $\Delta = 0$, it indicates complete confidence, and the scheme will be entirely non-adaptive, i.e., static.
-
-$$
-FSE =(Setup, KeyGen, Encrypt, Decrypt)
-$$
-
-- Building FSE form HE and DE
-1. first probabilistically encode the messages in a way that smooths the plaintext distribution.
-2. then deterministically encrypt them.
-This paper presents such a two-part, modular construction that
->1. homophonic encoding: smooth frequencies
->2. deterministic symmetric-key encryption: provide privacy
-
-**A stateful homophonic encoding scheme**
-$$
-HE = (Setup, Encode, Decode)
-$$
-All algorithms and parameters in a homophonic encoding scheme are **keyless**.
-> provide no message privacy.
-> Only consider the fixed-length encoding
-
-In this setting, it considers the indistinguishability of a series of samples from one of two distributions is more appropriate.
-
-
-**Deterministic encryption**
-$$
-DE = (KeyGen, Encrypt, Decrypt)
-$$
-
-
-- Some static HE schemes
-For this kind of scenario, it considers the case where the data's actual distribution is known to both the data owner and the adversary.
-
-1. Bounding an HE-SMOOTH adversary's advantage
-Since a message $m$'s homophone is chosen uniformly at random, each of its homophones $e$ will have frequency
-$$
-f_{D_s}(e) = \frac{f_{D}(m)}{|H^{HE}(m)|}
-$$
-
-Here, it bounds an $HE-SMOOTH$ adversary's distinguishing advantage using a result from **Baigneres, Junod, and Vaudenay's statistical framework** for analyzing distinguishers.
-> It shows that error probability of an optimal distinguisher given a number of samples from two close distributions $D_0$ and $D_1$
-> this can be bounded in terms of the **Kullback-Leibler (KL) divergence** of $D_0$ with respect to $D_1$
-
-$$
-KL(D_0, D_1) := \sum_{m \in M} f_{D_0}(m) log\frac{f_{D_0}(m)}{f_{D_1}(m)}
-$$
-
-
-
-2. Interval-based Homophonic Encoding (IBHE)
-Main idea: partitions the set of $r$-bit strings according to the distribution $D$, message $m$ will be allocated an interval of about $f_D(m) \times 2^r$ bitstrings.
-> Each messgae will be replaced by one of its coresponding $r$-bit strings.
-
-One way of partitioning the set of $r$-bit strings according to $D$:
-> the messages in $supp(D)={m_1, m_2, ...}$ are numbered by increasing frequency according to $D$.
-
-$$
-\{2^r \times F_D(m_{i-1}), ..., 2^r \times F_D(m_i) -1\}
-$$
-The encoding algorithm for IBHE simply selects an encoding $e$ of $m_i$ uniformly at random from the relevant interval. This interval has size approximately $2^r \times f_D(m_i)$, as desired.
-
-How to set the encoding bitlength $r$?
-> $r$ must be at least $log_2|supp(D)|$, so each message can have at least one possible encoding.
-> $r$ must be big enough so that each message is assigned a non-empty interval using this partitioning technique.
-
-This paper also proves the bound of $r$ to guarantee that every message $m \in supp(D)$ has at least $h$ homophones
-$$
-r_{min-h} := \lceil max_{1 \leq i \leq |supp(D)|} log_2 \frac{i*h - 0.5}{F_D(m_i)} \rceil
-$$
-
-Some variants with practicality in mind:
->1. append encodings to message rather than entirely replacing them.
->2. modify how intervals (homophone sets) are allocated in such a way that smaller encoding bitlengths are possible. Just assign one homophone to each "too small" message. It does this until each of the remaining messages can be assigned at least one homophone
-
-
-3. Banded homophonic encoding
-Main idea: append tags to messages rather than replacing them entirely.
-> 1. the length of tag is $l \geq 1$
-> 2. each message has at most $2^l$ homophones.
-> 3. each message has a **band** that determines the number of possible tags that can be appended to it and therefore the number of homophones it has.
-
-
-Divide the interval $(0, f_D(m_{|supp(D)|}))$ into $2^l$ bands, and each of width
-$$
-w := \frac{f_D(m_{|supp(D)|})}{2^l}
-$$
-
-This can numbered 1 to $2^l$. The message whose frequencies are in band $i$, in the interval $((i-1)*w, i*w)$, will each have $i$ homophones.
-> the most frequent message $m_{|M|}$, will have $2^l$ homophones, all possible $l$-bit strings can be appended to it.
-
-The main advantages of this banded HE schemes are that there is no minimum tag length and decoding is fast.
-> It does not need any table of frequency information to decode. Just removes the last $l$ bits of the $e$ to remove the original message $m$.
-
-
-- A maximum likelihood attack on static FSE
-The goal of a frequency-smoothing scheme is to smooth the distribution such that it becomes indistinguishable from uniform.
-
-The deterministic decryption functions
-> it cannot map one ciphertext to multiple plaintexts
-
-Map the most frequently occurring ciphertexts (with largest $n(c)$ values) to the messages with the largest "scaled frequencies" $\frac{f_D(m)}{|H^{FSE}(m)|}$
-
-
-The adversary's goal is to find the correct **many-to-one** decryption mapping $\theta: C \rightarrow M$
-
-
-
-### Implementation and Evaluation
-
-- Implementation
-Client:
-> the state $s$ of an FSE scheme is stored locally at the client.
-> a proxy that transparently performs the encryption and decryption operations.
-
-Do the simulation of FSE and then attacking the FSE scheme
-
-- Evaluation
-The security metric this paper work with is the number of the data items that an attacker can correctly decrypt, which has been used for assessing the effectiveness of inference attacks in the literature (CCS'15)
-
-Compared with the attack
-> it applies the technique of maximum likelihood estimation (MLE) to derive an efficient attack on a static FSE scheme
-
-Goal: prove attacking FSE is hard, in particular, at least as hard as attacking DE.
-
-
-## 2. Strength (Contributions of the paper)
-1. The key question: whether this trade-off between preventing frequency leakage and increasing query complexity is beneficial.
-> this paper shows that the answer to this question is positive, at least for certain distributions.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: ../paper_figure
+---
+Frequency-smoothing Encryption: Preventing Snapshot Attacks on Deterministically Encrypted Data
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| IACR'17 | Encryption |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper aims to combat inference attacks based on classical frequency analysis by using homophonic encoding.
+> develop the concept of frequency-smoothing encryption (FSE)
+> prevent inference attacks in the snapshot attack model
+
+- Frequency analysis
+If the plaintext distribution is not uniform and an adversary has a reference dataset from which it can compute expected plaintext frequencies,
+> then an adversary with access to a snapshot of the encrypted data
+> can try to match frequencies in the encrypted domain with those in the plaintext domain.
+
+### Frequency-smoothing encryption (FSE)
+- Homophonic encoding
+Goal: flatten the frequency distribution of messages by mapping each plaintext to multiple possible homophones.
+> the number of encodings for each plaintext $m$ ideally being proportional to the frequency of $m$
+> this method wants to ensure that each message has enough homophones to combat frequency analysis (but not so many that they cannot all be computed on the fly)
+
+- Threat Model
+This paper achieves security against only two forms of attack:
+1. the standard security notion for DE
+2. against inference attacks made by a snapshot attacker
+
+
+- Algorithm
+Design goal: outputs ciphertext whose frequencies are uniform.
+> even an adversary who knows the underlying plaintext frequencies cannot infer anything about the data.
+
+Difference Measurement $\Delta$ (distribution adaptation parameter):
+Use the upper bound on the **Kolmogorov-Smirnov** statistic of the two distributions.
+
+> statistical distance
+> this parameter indicates how much uncertainty is associated with the initial estimated distribution $\hat{D}$.
+> If $\Delta = 0$, it indicates complete confidence in $\hat{D}$, and the scheme will be entirely non-adaptive, i.e., static.
+
+$$
+FSE =(Setup, KeyGen, Encrypt, Decrypt)
+$$
+
+- Building FSE from HE and DE
+1. first probabilistically encode the messages in a way that smooths the plaintext distribution.
+2. then deterministically encrypt them.
+This paper presents such a two-part, modular construction:
+>1. homophonic encoding: smooth frequencies
+>2. deterministic symmetric-key encryption: provide privacy
+
+**A stateful homophonic encoding scheme**
+$$
+HE = (Setup, Encode, Decode)
+$$
+All algorithms and parameters in a homophonic encoding scheme are **keyless**.
+> provide no message privacy.
+> Only consider the fixed-length encoding
+
+In this setting, it considers indistinguishability of a series of samples drawn from one of two distributions to be the more appropriate security notion.
+
+
+**Deterministic encryption**
+$$
+DE = (KeyGen, Encrypt, Decrypt)
+$$
+
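+A minimal sketch of the modular composition described above (hypothetical `HE`/`DE` objects, not the paper's exact interfaces): encryption encodes first and then deterministically encrypts, and decryption inverts the two steps in reverse order.
+
+```python
+# Sketch of the HE-then-DE composition (hypothetical interfaces).
+class FSE:
+    def __init__(self, he, de, key):
+        self.he = he     # keyless, stateful homophonic encoding scheme
+        self.de = de     # deterministic symmetric-key encryption scheme
+        self.key = key   # key for DE only; HE is keyless
+
+    def encrypt(self, m):
+        e = self.he.encode(m)                 # smooth frequencies: pick a homophone of m
+        return self.de.encrypt(self.key, e)   # hide the content deterministically
+
+    def decrypt(self, c):
+        e = self.de.decrypt(self.key, c)
+        return self.he.decode(e)              # map the homophone back to m
+```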
+
+- Some static HE schemes
+For this kind of scenario, it considers the case in which the data's actual distribution is known to both the data owner and the adversary.
+
+1. Bounding an HE-SMOOTH adversary's advantage
+Since a message $m$'s homophone is chosen uniformly at random, each of its homophones $e$ will have frequency
+$$
+f_{D_s}(e) = \frac{f_{D}(m)}{|H^{HE}(m)|}
+$$
+
+Here, it bounds an $HE\text{-}SMOOTH$ adversary's distinguishing advantage using a result from **Baigneres, Junod, and Vaudenay's statistical framework** for analyzing distinguishers.
+> It shows that the error probability of an optimal distinguisher, given a number of samples from two close distributions $D_0$ and $D_1$,
+> can be bounded in terms of the **Kullback-Leibler (KL) divergence** of $D_0$ with respect to $D_1$:
+
+$$
+KL(D_0, D_1) := \sum_{m \in M} f_{D_0}(m) \log\frac{f_{D_0}(m)}{f_{D_1}(m)}
+$$
+
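+For concreteness, the KL divergence above can be computed directly from the two frequency functions. A small sketch, assuming both distributions are given as dictionaries over the same support:
+
+```python
+import math
+
+def kl_divergence(f_d0, f_d1):
+    """KL(D_0, D_1) = sum_m f_D0(m) * log(f_D0(m) / f_D1(m))."""
+    return sum(p * math.log(p / f_d1[m]) for m, p in f_d0.items() if p > 0)
+
+# Example: a well-smoothed distribution is close to uniform, so the divergence is small.
+print(kl_divergence({"a": 0.5, "b": 0.5}, {"a": 0.6, "b": 0.4}))
+```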
+
+
+2. Interval-based Homophonic Encoding (IBHE)
+Main idea: partition the set of $r$-bit strings according to the distribution $D$, so that message $m$ is allocated an interval of about $f_D(m) \times 2^r$ bitstrings.
+> Each message will be replaced by one of its corresponding $r$-bit strings.
+
+One way of partitioning the set of $r$-bit strings according to $D$:
+> the messages in $supp(D)=\{m_1, m_2, ...\}$ are numbered by increasing frequency according to $D$; message $m_i$ is then assigned the interval
+
+$$
+\{2^r \times F_D(m_{i-1}), ..., 2^r \times F_D(m_i) -1\}
+$$
+The encoding algorithm for IBHE simply selects an encoding $e$ of $m_i$ uniformly at random from the relevant interval. This interval has size approximately $2^r \times f_D(m_i)$, as desired.
+
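+A minimal sketch of this interval-based encoding, assuming the messages are given with their exact frequencies (the rounding details handled in the paper are ignored here):
+
+```python
+import random
+
+def ibhe_intervals(freqs, r):
+    """Assign each message an interval of roughly f_D(m) * 2^r of the r-bit strings."""
+    intervals, cdf = {}, 0.0
+    for m, f in sorted(freqs.items(), key=lambda kv: kv[1]):   # by increasing frequency
+        lo = int(round(cdf * 2**r))
+        cdf += f
+        hi = int(round(cdf * 2**r))        # interval is [lo, hi)
+        intervals[m] = (lo, hi)
+    return intervals
+
+def encode(m, intervals):
+    lo, hi = intervals[m]
+    return random.randrange(lo, hi)        # pick one homophone uniformly at random
+
+def decode(e, intervals):
+    return next(m for m, (lo, hi) in intervals.items() if lo <= e < hi)
+```
+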
+How to set the encoding bitlength $r$?
+> $r$ must be at least $\log_2|supp(D)|$, so each message can have at least one possible encoding.
+> $r$ must be big enough so that each message is assigned a non-empty interval using this partitioning technique.
+
+This paper also proves a bound on $r$ that guarantees every message $m \in supp(D)$ has at least $h$ homophones:
+$$
+r_{\min\text{-}h} := \left\lceil \max_{1 \leq i \leq |supp(D)|} \log_2 \frac{i \cdot h - 0.5}{F_D(m_i)} \right\rceil
+$$
+
+Some variants with practicality in mind:
+>1. append encodings to messages rather than entirely replacing them.
+>2. modify how intervals (homophone sets) are allocated so that smaller encoding bitlengths are possible: assign a single homophone to each "too small" message, repeating until each of the remaining messages can be assigned at least one homophone.
+
+
+3. Banded homophonic encoding
+Main idea: append tags to messages rather than replacing them entirely.
+> 1. the length of the tag is $l \geq 1$
+> 2. each message has at most $2^l$ homophones.
+> 3. each message has a **band** that determines the number of possible tags that can be appended to it and therefore the number of homophones it has.
+
+
+Divide the interval $(0, f_D(m_{|supp(D)|}))$ into $2^l$ bands, each of width
+$$
+w := \frac{f_D(m_{|supp(D)|})}{2^l}
+$$
+
+The bands are numbered 1 to $2^l$. Messages whose frequencies fall in band $i$, i.e., in the interval $((i-1) \cdot w, i \cdot w)$, will each have $i$ homophones.
+> the most frequent message, $m_{|supp(D)|}$, will have $2^l$ homophones: all possible $l$-bit strings can be appended to it.
+
+The main advantages of this banded HE scheme are that there is no minimum tag length and decoding is fast.
+> It does not need any table of frequency information to decode: just remove the last $l$ bits of the encoding $e$ to recover the original message $m$.
+
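+A minimal sketch of the banded encoding, assuming the frequencies and the tag length $l$ are given (band width and homophone counts follow the description above):
+
+```python
+import math
+import random
+
+def num_homophones(freq, max_freq, l):
+    """A message whose frequency falls in band i gets i possible tags."""
+    w = max_freq / 2**l                                  # band width
+    return min(2**l, max(1, math.ceil(freq / w)))        # band index i
+
+def encode(m, freqs, l):
+    i = num_homophones(freqs[m], max(freqs.values()), l)
+    tag = random.randrange(i)                            # one of i possible l-bit tags
+    return (m, format(tag, f"0{l}b"))                    # append the tag to the message
+
+def decode(encoded):
+    m, _tag = encoded                                    # decoding just drops the tag
+    return m
+```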
+
+- A maximum likelihood attack on static FSE
+The goal of a frequency-smoothing scheme is to smooth the distribution such that it becomes indistinguishable from uniform.
+
+The decryption function is deterministic:
+> it cannot map one ciphertext to multiple plaintexts.
+
+The attack maps the most frequently occurring ciphertexts (with the largest $n(c)$ values) to the messages with the largest "scaled frequencies" $\frac{f_D(m)}{|H^{FSE}(m)|}$.
+
+
+The adversary's goal is to find the correct **many-to-one** decryption mapping $\theta: C \rightarrow M$
+
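+A minimal sketch of this maximum-likelihood matching step (hypothetical inputs: ciphertext counts `n`, plaintext frequencies `f_D`, and homophone counts `H`):
+
+```python
+def mle_decryption_mapping(n, f_D, H):
+    """Greedily map the most frequent ciphertexts to the messages with the largest
+    scaled frequencies f_D(m) / |H(m)|, giving each message at most |H(m)| ciphertexts."""
+    ciphertexts = sorted(n, key=n.get, reverse=True)
+    messages = iter(sorted(f_D, key=lambda m: f_D[m] / H[m], reverse=True))
+    theta, m, budget = {}, None, 0
+    for c in ciphertexts:
+        if budget == 0:                    # move on to the next-best message
+            m = next(messages, None)
+            if m is None:
+                break
+            budget = H[m]
+        theta[c] = m                       # many-to-one mapping theta: C -> M
+        budget -= 1
+    return theta
+```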
+
+
+### Implementation and Evaluation
+
+- Implementation
+Client:
+> the state $s$ of an FSE scheme is stored locally at the client.
+> a proxy that transparently performs the encryption and decryption operations.
+
+Simulate the FSE scheme and then attack it.
+
+- Evaluation
+The security metric this paper works with is the number of data items that an attacker can correctly decrypt, which has been used for assessing the effectiveness of inference attacks in the literature (CCS'15).
+
+Comparison against the attack:
+> it applies the technique of maximum likelihood estimation (MLE) to derive an efficient attack on a static FSE scheme.
+
+Goal: prove attacking FSE is hard, in particular, at least as hard as attacking DE.
+
+
+## 2. Strength (Contributions of the paper)
+1. The key question: whether this trade-off between preventing frequency leakage and increasing query complexity is beneficial.
+> this paper shows that the answer to this question is positive, at least for certain distributions.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/Security/ORAM-CCSW'16.md b/StoragePaperNote/Security/ORAM-CCSW'16.md
old mode 100644
new mode 100755
index 1b9b93f..9880bf6
--- a/StoragePaperNote/Security/ORAM-CCSW'16.md
+++ b/StoragePaperNote/Security/ORAM-CCSW'16.md
@@ -1,78 +1,78 @@
----
-typora-copy-images-to: ../paper_figure
----
-Oblivious RAM as a Substrate for Cloud Storage - The Leakage Challenge Ahead
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| CCSW'16 | ORAM |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- ORAM Storage Interface
-the atomic unit of storage and access in ORAM algorithms has been the **block**. Allow a ORAM client to **read** and **write** to block addresses.
-> the untrusted storage server cannot learn the plaintext of user content, the requested addresses, nor the relationships between requested addresses.
-
-- This paper investigates the effects of the RAM interface mismatch between ORAM algorithms and cloud storage.
-> the block-oriented interface of ORAM can be also problematic for cloud applications in terms of leakage.
-
-### ORAM side-channel
-- Threat Model
-the remote server is "honest but curious"
-> correctly follows the protocol
-> attempts to gain as much knowledge as possible by direct observation of the data access pattern.
-
-- Metrics
-1. Bandwidth Efficiency
-$$
-BE = \frac{\text{Total Data to access a set of file}}{\text{Total size of files}}
-$$
-
-
-Given a set of $M$ files $F=\{f_1, f_2, ..., f_M\}$. It also considers the **access probability** of
-file $i$
-$$
-\sum_{i=1}^M P(f_i) =1
-$$
-
-2. Privacy Leakage
-Current ORAM schemes operate at the block level. They may leak bits of information about the secret input when accesses occur at a **higher granularity (file)**.
-> Every file access translates into a batch of random block access.
-
-If the server sees a batch of size $b$ blocks, then it knows that this file belongs to class $F_b$, this can tell the server some information about the access pattern.
-> measure the bit leakage by comparing the uncertainty of the server about $F$ before and after observing $B$
-$$
-\text{leakage} = \text{initial uncertainty} - \text{remaining uncertainty}
-$$
-
-It uses the posteriori probability to define this uncertainty.
-
-The root cause of this leakage is the mapping between files and batches is **deterministic**.
-
-
-- Tradeoff between information leakage and performance efficiency
-Achieve different tradeoff points via varying the size of block.
-
-- How to mitigate this information leakage
-1. Maximizing block size (naive strategy)
-restrict the size of all batches to 1 by choosing as the block size the size of the largest file.
-> personal storage systems such Dropbox, exhibit high variability in file size. (fitted by heavy-tailed distributions)
-> make it impractical to predict the size of the largest file in advance.
-
-
-2. Periodic ORAM access
-By accessing the ORAM at a periodic rate, it is possible to fully obfuscate the actual number of file blocks, because the server cannot tell **when** a request for a file starts and terminates.
-> may harm the performance since high overhead
-> how to set the periodic rate?
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. this paper shows a formal definition of the information leakage in this problem.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. This work also investigates how different block sizes affects the degree of information leakage in ORAM
-> this can also extend to how different chunk sizes affects the degree of information leakage in deduplication system.
+---
+typora-copy-images-to: ../paper_figure
+---
+Oblivious RAM as a Substrate for Cloud Storage - The Leakage Challenge Ahead
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| CCSW'16 | ORAM |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- ORAM Storage Interface
+The atomic unit of storage and access in ORAM algorithms has been the **block**. This allows an ORAM client to **read** and **write** block addresses.
+> the untrusted storage server cannot learn the plaintext of user content, the requested addresses, nor the relationships between requested addresses.
+
+- This paper investigates the effects of the RAM interface mismatch between ORAM algorithms and cloud storage.
+> the block-oriented interface of ORAM can also be problematic for cloud applications in terms of leakage.
+
+### ORAM side-channel
+- Threat Model
+the remote server is "honest but curious"
+> correctly follows the protocol
+> attempts to gain as much knowledge as possible by direct observation of the data access pattern.
+
+- Metrics
+1. Bandwidth Efficiency
+$$
+BE = \frac{\text{total data transferred to access a set of files}}{\text{total size of the files}}
+$$
+
+
+Given a set of $M$ files $F=\{f_1, f_2, ..., f_M\}$, it also considers the **access probability** of
+each file $f_i$, where
+$$
+\sum_{i=1}^M P(f_i) =1
+$$
+
+2. Privacy Leakage
+Current ORAM schemes operate at the block level. They may leak bits of information about the secret input when accesses occur at a **higher granularity (file)**.
+> Every file access translates into a batch of random block accesses.
+
+If the server sees a batch of $b$ blocks, then it knows that this file belongs to class $F_b$, which tells the server some information about the access pattern.
+> measure the bit leakage by comparing the uncertainty of the server about $F$ before and after observing $B$
+$$
+\text{leakage} = \text{initial uncertainty} - \text{remaining uncertainty}
+$$
+
+It uses the posterior probability to define this uncertainty.
+
+The root cause of this leakage is that the mapping between files and batches is **deterministic**.
+
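+A small sketch of this leakage computation, assuming Shannon entropy is used as the uncertainty measure (the paper's exact definition, based on posterior probabilities, may differ):
+
+```python
+import math
+from collections import defaultdict
+
+def leakage_bits(p_file, blocks_of):
+    """Leakage = H(F) - H(F | B), where B is the observed batch size in blocks."""
+    h_prior = -sum(p * math.log2(p) for p in p_file.values() if p > 0)
+
+    classes = defaultdict(dict)                 # group files into classes F_b
+    for f, p in p_file.items():
+        classes[blocks_of[f]][f] = p
+
+    h_post = 0.0
+    for cls in classes.values():
+        p_b = sum(cls.values())                 # probability of observing batch size b
+        h_post -= p_b * sum((p / p_b) * math.log2(p / p_b) for p in cls.values() if p > 0)
+    return h_prior - h_post
+
+# Example: two 1-block files and one 4-block file, accessed uniformly.
+print(leakage_bits({"a": 1/3, "b": 1/3, "c": 1/3}, {"a": 1, "b": 1, "c": 4}))
+```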
+
+- Tradeoff between information leakage and performance efficiency
+Achieve different tradeoff points via varying the size of block.
+
+- How to mitigate this information leakage
+1. Maximizing block size (naive strategy)
+restrict the size of all batches to 1 by choosing as the block size the size of the largest file.
+> personal storage systems such as Dropbox exhibit high variability in file size (well fitted by heavy-tailed distributions),
+> which makes it impractical to predict the size of the largest file in advance.
+
+
+2. Periodic ORAM access
+By accessing the ORAM at a periodic rate, it is possible to fully obfuscate the actual number of file blocks, because the server cannot tell **when** a request for a file starts and terminates.
+> may harm the performance since high overhead
+> how to set the periodic rate?
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. this paper shows a formal definition of the information leakage in this problem.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. This work also investigates how different block sizes affect the degree of information leakage in ORAM;
+> this can also be extended to how different chunk sizes affect the degree of information leakage in deduplication systems.
diff --git a/StoragePaperNote/Security/OverheadConfidentiality-UCC'19.md b/StoragePaperNote/Security/OverheadConfidentiality-UCC'19.md
old mode 100644
new mode 100755
index a7a0e9c..179965c
--- a/StoragePaperNote/Security/OverheadConfidentiality-UCC'19.md
+++ b/StoragePaperNote/Security/OverheadConfidentiality-UCC'19.md
@@ -1,71 +1,71 @@
----
-typora-copy-images-to: ../paper_figure
----
-The Overhead of Confidentiality and Client-side Encryption in Cloud Storage Systems
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| UCC'19 | Data Encryption |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-
-Many cloud storage services are fairly blunt regarding the lack of confidentiality they provide.
-> a solution to provide confidential cloud storage is to use client-side encryption (CSE).
-
-- The issue of CSE
-CSE complicates file synchronization techniques, such as deduplication and delta encoding, commonly used to reduce the traffic overheads associated with personal cloud storage systems.
-
-- The goal of this paper
-this paper presents empirical experiments and analysis of CSE-related overheads.
-> compare and contrast the security and bandwidth saving features implemented by CSE services and non-CSE services.
-> compression, delta encoding, and deduplication
-
-### Data confidentiality overhead
-This paper focuses on pure CSEs.
-
-- Service
-1. CSEs
-Mega, Sync.com, SpiderOak, Tresorit
-> Mega: AES-128
-> SpiderOak: AES-256-CFB
-> Password-Based Key Derivation Function 2 (PBKDF2)
-
-2. non-CSEs
-Dropbox, iCloud, Google Drive, Microsoft OneDrive
-> This was selected base on recommendations in online reviews
-> https://www.cloudwards.net/comparison/
-
-- Baseline methodology
-adding files to the cloud services' sync folders and performing targeted system and network measurements during the sync process.
-
-1. Network traffic
-using Python modules **netifaces** and **pcapy** among others.
-2. CPU memory overhead
-using Python modules **psutil**
-
-
-- Bandwidth saving feature
-
-1. Client-side deduplication
-Dropbox, iCloud, Mega, SpiderOak, and Sync.com
-2. Other
-Google drive, One Drive and Tresorit
-
-
-
-### Implementation and Evaluation
-
-- Evaluation
-Setting: Macbook Air, high-speed university network through 10Gb/s.
-
-## 2. Strength (Contributions of the paper)
-
-1. This paper presents a comprehensive analysis of state-of-art cloud storage services
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
-1. how to combine delta encoding with CSEs
+---
+typora-copy-images-to: ../paper_figure
+---
+The Overhead of Confidentiality and Client-side Encryption in Cloud Storage Systems
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| UCC'19 | Data Encryption |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+
+Many cloud storage services are fairly blunt regarding the lack of confidentiality they provide.
+> a solution to provide confidential cloud storage is to use client-side encryption (CSE).
+
+- The issue of CSE
+CSE complicates file synchronization techniques, such as deduplication and delta encoding, commonly used to reduce the traffic overheads associated with personal cloud storage systems.
+
+- The goal of this paper
+this paper presents empirical experiments and analysis of CSE-related overheads.
+> compare and contrast the security and bandwidth saving features implemented by CSE services and non-CSE services.
+> compression, delta encoding, and deduplication
+
+### Data confidentiality overhead
+This paper focuses on pure CSEs.
+
+- Service
+1. CSEs
+Mega, Sync.com, SpiderOak, Tresorit
+> Mega: AES-128
+> SpiderOak: AES-256-CFB
+> Password-Based Key Derivation Function 2 (PBKDF2)
+
+2. non-CSEs
+Dropbox, iCloud, Google Drive, Microsoft OneDrive
+> These were selected based on recommendations in online reviews
+> https://www.cloudwards.net/comparison/
+
+- Baseline methodology
+adding files to the cloud services' sync folders and performing targeted system and network measurements during the sync process.
+
+1. Network traffic
+using the Python modules **netifaces** and **pcapy**, among others.
+2. CPU and memory overhead
+using the Python module **psutil**
+
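+A minimal sketch of the kind of measurement loop this methodology implies, using **psutil** to sample a sync client's CPU and memory while a file synchronizes (the exact hooks, interfaces, and sampling rates used in the paper are not stated here and are assumptions; packet capture with **pcapy** is omitted):
+
+```python
+import time
+import psutil
+
+def sample_sync_client(pid, duration_s=60, interval_s=1.0):
+    """Periodically record CPU, resident memory, and system-wide traffic counters."""
+    proc = psutil.Process(pid)
+    samples = []
+    end = time.time() + duration_s
+    while time.time() < end:
+        samples.append({
+            "cpu_percent": proc.cpu_percent(interval=None),
+            "rss_bytes": proc.memory_info().rss,
+            "net_io": psutil.net_io_counters()._asdict(),
+        })
+        time.sleep(interval_s)
+    return samples
+```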
+
+- Bandwidth saving feature
+
+1. Client-side deduplication
+Dropbox, iCloud, Mega, SpiderOak, and Sync.com
+2. Other
+Google Drive, OneDrive, and Tresorit
+
+
+
+### Implementation and Evaluation
+
+- Evaluation
+Setting: MacBook Air, connected to a high-speed university network at 10 Gb/s.
+
+## 2. Strength (Contributions of the paper)
+
+1. This paper presents a comprehensive analysis of state-of-the-art cloud storage services
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
+1. how to combine delta encoding with CSEs
the development of optimized delta encoding policies for CSEs, which minimize the bandwidth and storage overhead associated with CSE.
\ No newline at end of file
diff --git a/StoragePaperNote/Security/QuantifyingInformationLeakage-CCSW'19.md b/StoragePaperNote/Security/QuantifyingInformationLeakage-CCSW'19.md
old mode 100644
new mode 100755
index 4ef56a5..ec430c8
--- a/StoragePaperNote/Security/QuantifyingInformationLeakage-CCSW'19.md
+++ b/StoragePaperNote/Security/QuantifyingInformationLeakage-CCSW'19.md
@@ -1,57 +1,57 @@
----
-typora-copy-images-to: ../paper_figure
----
-Quantifying Information Leakage of Deterministic Encryption
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| CCSW'19 | Deterministic Encryption |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Deterministic encryption
-ensures that the same plaintext encrypted under the same key will produce the same ciphertext.
-> enable clients to make queries on sensitive data.
-> the security implications of deterministic encryption are not well understood.
-
-
-- Deterministic encryption is controversial
- - CryptoDB claims that: deterministic encryption is safe to use for sensitive field if every value in a column appears only once.
- - Deterministic encryption for non-unique fields is described as **allowing some leakage**. (there does not currently exist a clear understanding of the leakage)
-
-This paper provides a leakage analysis of deterministic encryption through the application of the framework of **quantitative information flow**.
-
-### Quantitative Information Flow (QIF)
-- Key Insight:
-there is no one "correct" way to measure leakage without a given operational scenario
-> different operational scenarios require different leakage measures.
-
-- QIF definition
- - prior g-vulnerability
- - posterior g-vulnerability
-
-
-it is natural to measure the leakage of the channel by comparing the prior g-vulnerability with the posterior g-vulnerability.
-
-- Model of deterministic encryption
- - Bayes vulnerability
-the adversary attempts to guess the entire column. consider different distributions(three kinds of values):
-> 1. uniform distribution
-> 2. an arbitrarily chosen non-uniform distribution
-> 3. a distribution in which two values has the same probability
-> 4. a distribution in which two values' probabilities are very close but not the same.
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. This paper provides a comprehensive information leakage analysis via considering different distribution and different operational scenarios of the adversary.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights
-1. This paper mentions the way to mitigate inference attack by inserting fake entries prior to uploading the database to the cloud.
-> For encrypted deduplication, can we insert fake chunk to the original workload to mitigate the attack?
-
-2. This paper considers different distribution and analyzes the information leakage in different cases.
+---
+typora-copy-images-to: ../paper_figure
+---
+Quantifying Information Leakage of Deterministic Encryption
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| CCSW'19 | Deterministic Encryption |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Deterministic encryption
+ensures that the same plaintext encrypted under the same key will produce the same ciphertext.
+> enable clients to make queries on sensitive data.
+> the security implications of deterministic encryption are not well understood.
+
+
+- Deterministic encryption is controversial
+  - CryptDB claims that deterministic encryption is safe to use for sensitive fields if every value in a column appears only once.
+ - Deterministic encryption for non-unique fields is described as **allowing some leakage**. (there does not currently exist a clear understanding of the leakage)
+
+This paper provides a leakage analysis of deterministic encryption through the application of the framework of **quantitative information flow**.
+
+### Quantitative Information Flow (QIF)
+- Key Insight:
+there is no one "correct" way to measure leakage without a given operational scenario
+> different operational scenarios require different leakage measures.
+
+- QIF definition
+ - prior g-vulnerability
+ - posterior g-vulnerability
+
+
+it is natural to measure the leakage of the channel by comparing the prior g-vulnerability with the posterior g-vulnerability.
+
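+As a concrete instance of this comparison, a small sketch of Bayes vulnerability for a generic deterministic leakage function (not the paper's exact model of a deterministically encrypted column): prior vulnerability is the probability of the best single guess, posterior vulnerability is the expected probability of the best guess after observing the output.
+
+```python
+from collections import defaultdict
+
+def prior_bayes_vulnerability(prior):
+    return max(prior.values())
+
+def posterior_bayes_vulnerability(prior, channel):
+    """For a deterministic channel, sum over outputs of the most likely input in each class."""
+    best = defaultdict(float)
+    for x, p in prior.items():
+        best[channel(x)] = max(best[channel(x)], p)
+    return sum(best.values())
+
+# Toy leakage function: secrets A and B produce the same observable, C a different one.
+prior = {"A": 0.5, "B": 0.3, "C": 0.2}
+channel = {"A": "c1", "B": "c1", "C": "c2"}.get
+print(posterior_bayes_vulnerability(prior, channel) / prior_bayes_vulnerability(prior))
+```
+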
+- Model of deterministic encryption
+ - Bayes vulnerability
+the adversary attempts to guess the entire column, considering different distributions (three kinds of values):
+> 1. uniform distribution
+> 2. an arbitrarily chosen non-uniform distribution
+> 3. a distribution in which two values have the same probability
+> 4. a distribution in which two values' probabilities are very close but not the same.
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. This paper provides a comprehensive information leakage analysis by considering different distributions and different operational scenarios of the adversary.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights
+1. This paper mentions a way to mitigate inference attacks by inserting fake entries prior to uploading the database to the cloud.
+> For encrypted deduplication, can we insert fake chunks into the original workload to mitigate the attack?
+
+2. This paper considers different distributions and analyzes the information leakage in different cases.
> In our paper, we do not provide a fine-grained information leakage analysis; we just use KLD as a coarse measure of information leakage.
\ No newline at end of file
diff --git a/StoragePaperNote/SGX Storage/SPEED-ICDCS'19.md b/StoragePaperNote/Security/SGX-Storage/SPEED-ICDCS'19.md
old mode 100644
new mode 100755
similarity index 97%
rename from StoragePaperNote/SGX Storage/SPEED-ICDCS'19.md
rename to StoragePaperNote/Security/SGX-Storage/SPEED-ICDCS'19.md
index b0fcbc4..76aab1c
--- a/StoragePaperNote/SGX Storage/SPEED-ICDCS'19.md
+++ b/StoragePaperNote/Security/SGX-Storage/SPEED-ICDCS'19.md
@@ -1,106 +1,106 @@
----
-typora-copy-images-to: ../paper_figure
----
-SPEED: Accelerating Enclave Applications via Secure Deduplication
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ICDCS'19 | Computation Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
- - Existing approaches to accelerate applications in the context of Intel SGX.
- - asynchronous system calls.
- - exit-less remote procedure calls: mitigate the cost of enclave exit.
-
- - A new angle to optimize the performance of applications in SGX
- - cache or reuse the results of previous computations.
- - computation deduplication in the context of SGX is till a non-trivial task.
- - a particular computation is **deterministic** yet **time-consuming**.
- - it would be more efficient to cache and reuse the same result rather than re-computing it.
-
-
-- Main issues
- - 1. bind a particular computation to its result
- - consider both function's code and input data
- - 2. manage reusable results efficiently and securely
- - small TCB size
- - store data outside the enclave
- - 3. share these encrypted results between different applications
- - MLE
- - without sharing a system-wide secret key among all applications, which is vulnerable to the potential single point of compromise.
-
-### SPEED
-
-- Architecture
- - DedupRuntime: a secure deduplication runtime as a *trusted library*
- - ResultStore: a generic encrypted result store
- - Developers: the developer are willing to harden their applications with the recent advancement in hardware-assisted security technologies.
-
-
-
-- Threat model
- - a powerful adversary: control the software stack of physical machines, including hypervisor and OS.
- - TEE cannot be compromised, do not consider the side-channel attacks.
- - protect: the function's code, input data, and computation results.
- - may leakage whether an intended computation has been done before.
-
-- Whole workflow
- - DedupRuntime:
- - Does not require to share a system-wide secret key in advance.
- - $tag \leftarrow Hash(func, m)$, for function $func(m)$
- - 
- - 
- - intercepting marked function calls, querying ResultStore, and retrieving the possible computation results.
- - ResultStore:
- - enclave-protected dictionary
- - keep only small-sized metadata inside the enclaves (pointers)
- - storing the actual content outside
-
-
-
-### Implementation and Evaluation
-- Implementation
- - C++, SGX SDK
-
-- Datasets
- - computer version
- - data compression
-
-- Evaluation
- - developer effort
- - only requires very little modifications as few as 2 lines
- - application performance
- - without using SPEED
- - initial computation with SPEED
- - subsequent computation with using SPEED
- - cryptographic operations
- - tag generation is the bottleneck
-
-## 2. Strength (Contributions of the paper)
-1. propose the first secure and generic computation deduplication system SPEED.
-> protect and reuse computation results across multiple applications.
-
-2. implement a fully functional prototype with Intel SGX SDK.
-> open-source in github
-
-## 3. Weakness (Limitations of the paper)
-1. do not test the dataset with a large size.
-> performance is poor with the data is large
-
-
-## 4. Some Insights (Future work)
-
-- SGX background
- - Intel SGX
- - it is popular in academia and industry.
- - AMD TrustZone
-
-- SGX research direction
- - how to reduce the performance overhead of the SGX
- - eliminate the expensive context switches in enclave application
- - design more efficient memory management scheme
- - maintain user-level page table in enclave for exit-less paging
+---
+typora-copy-images-to: ../paper_figure
+---
+SPEED: Accelerating Enclave Applications via Secure Deduplication
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ICDCS'19 | Computation Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+ - Existing approaches to accelerate applications in the context of Intel SGX.
+ - asynchronous system calls.
+ - exit-less remote procedure calls: mitigate the cost of enclave exit.
+
+ - A new angle to optimize the performance of applications in SGX
+ - cache or reuse the results of previous computations.
+    - computation deduplication in the context of SGX is still a non-trivial task.
+ - a particular computation is **deterministic** yet **time-consuming**.
+ - it would be more efficient to cache and reuse the same result rather than re-computing it.
+
+
+- Main issues
+ - 1. bind a particular computation to its result
+ - consider both function's code and input data
+ - 2. manage reusable results efficiently and securely
+ - small TCB size
+ - store data outside the enclave
+ - 3. share these encrypted results between different applications
+ - MLE
+ - without sharing a system-wide secret key among all applications, which is vulnerable to the potential single point of compromise.
+
+### SPEED
+
+- Architecture
+ - DedupRuntime: a secure deduplication runtime as a *trusted library*
+ - ResultStore: a generic encrypted result store
+  - Developers: developers are willing to harden their applications with recent advancements in hardware-assisted security technologies.
+
+
+
+- Threat model
+ - a powerful adversary: control the software stack of physical machines, including hypervisor and OS.
+ - TEE cannot be compromised, do not consider the side-channel attacks.
+ - protect: the function's code, input data, and computation results.
+  - may leak whether an intended computation has been done before.
+
+- Whole workflow
+ - DedupRuntime:
+    - Does not require sharing a system-wide secret key in advance.
+    - $tag \leftarrow Hash(func, m)$, for function $func(m)$ (see the sketch after this list)
+ - 
+ - 
+ - intercepting marked function calls, querying ResultStore, and retrieving the possible computation results.
+ - ResultStore:
+ - enclave-protected dictionary
+ - keep only small-sized metadata inside the enclaves (pointers)
+ - storing the actual content outside
+
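+A minimal sketch of the dedup lookup described in the workflow above, with the tag binding both the function code and the input (the `ResultStore` interface here is hypothetical):
+
+```python
+import hashlib
+
+def compute_tag(func_code: bytes, input_data: bytes) -> str:
+    """tag <- Hash(func, m): binds a computation to both its code and its input."""
+    return hashlib.sha256(func_code + input_data).hexdigest()
+
+def run_or_reuse(store, func, func_code: bytes, input_data: bytes):
+    tag = compute_tag(func_code, input_data)
+    cached = store.get(tag)          # hypothetical ResultStore lookup
+    if cached is not None:
+        return cached                # reuse a previously computed result
+    result = func(input_data)        # deterministic, time-consuming computation
+    store.put(tag, result)           # make the result reusable across applications
+    return result
+```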
+
+
+### Implementation and Evaluation
+- Implementation
+ - C++, SGX SDK
+
+- Datasets
+  - computer vision
+ - data compression
+
+- Evaluation
+ - developer effort
+    - requires very few modifications, as few as 2 lines
+ - application performance
+ - without using SPEED
+ - initial computation with SPEED
+ - subsequent computation with using SPEED
+ - cryptographic operations
+ - tag generation is the bottleneck
+
+## 2. Strength (Contributions of the paper)
+1. propose the first secure and generic computation deduplication system SPEED.
+> protect and reuse computation results across multiple applications.
+
+2. implement a fully functional prototype with Intel SGX SDK.
+> open-source in github
+
+## 3. Weakness (Limitations of the paper)
+1. does not test datasets of large size.
+> performance is poor when the data is large
+
+
+## 4. Some Insights (Future work)
+
+- SGX background
+ - Intel SGX
+ - it is popular in academia and industry.
+ - AMD TrustZone
+
+- SGX research direction
+ - how to reduce the performance overhead of the SGX
+ - eliminate the expensive context switches in enclave application
+ - design more efficient memory management scheme
+ - maintain user-level page table in enclave for exit-less paging
- deduplicating the repeated and often time-consuming computation
\ No newline at end of file
diff --git a/StoragePaperNote/Security/SGX-Storage/SPEICHER-FAST'19.md b/StoragePaperNote/Security/SGX-Storage/SPEICHER-FAST'19.md
new file mode 100755
index 0000000..ec849b3
--- /dev/null
+++ b/StoragePaperNote/Security/SGX-Storage/SPEICHER-FAST'19.md
@@ -0,0 +1,128 @@
+---
+typora-copy-images-to: ../paper_figure
+---
+SPEICHER: Securing LSM-based Key-Value Stores using Shielded Execution
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'19 | SGX-DB |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+
+- Motivation
+ - Persistent KV stores have become a fundamental part of the cloud infrastructure.
+ - The security vulnerabilities in third-party cloud pose a serious threat to storage system
+  - The security vulnerabilities in third-party clouds pose a serious threat to storage systems
+    - securing a storage system is quite challenging, since it involves various layers in the system stack.
+ - aims to provide strong security properties using the enclave.
+
+- Challenge
+ - Shielded execution are primarily designed for securing "**stateless**" in-memory computations and data.
+  - Shielded execution is primarily designed for securing "**stateless**" in-memory computations and data.
+    - not sufficient for building a secure storage system.
+    - how to extend the trust beyond the "secure, but stateless" enclave memory region to the "untrusted and persistent" storage medium, while ensuring that the security properties are preserved in the "stateful" setting
+    - even across system reboot, migration, or crash.
+- Goal: confidentiality, integrity, and freshness.
+
+
+
+### SPEICHER
+
+- Threat model
+ - In addition to the standard SGX threat model, it also considers the security attacks that can be launched using an **untrusted storage medium**.
+ - The adversary can control the entire system software stack, including the OS or hypervisor, and is able to launch physical attacks
+ - performing memory probes.
+
+- Design challenges
+ - Limited EPC size: the paging incurs high performance overheads.
+ - Untrusted storage medium: need to extend the trust to the untrusted storage medium.
+ - Expensive I/O syscall: incur higher performance overhead. (TLB flushes, security check)
+ - Trusted counter:
+ - the SGX counters are impractical to design a storage system.
+
+- System architecture
+
+
+ - Controller
+ - The TLS channel is terminated inside the controller (inside the enclave)
+ - The controller performs the remote attestation service to the clients, and access control operations
+ - Shielded I/O library
+ - SPDK's device driver runs inside the enclave.
+ - Trusted counter
+    - defer the counter increment until the data is persisted without losing any availability guarantees.
+ - MemTable
+ - devise a mechanism to ensure the confidentiality, integrity, and freshness of the MemTable.
+ - partition the existing MemTable in two parts: key path and value path.
+ - encrypting the value, enabling fast key lookups inside the enclave.
+ - also store the hash of the value inside the enclave.
+ - can reduce the space size
+ - 
+ - SSTable
+ - maintain the KV pairs persistently
+ - group KV pairs together into blocks.
+ - encrypt each block, calculate a hash over each block
+      - these hashes are then grouped together in a block of hashes and appended at the end of the SSTable file.
+
+- Operation
+ - Put: first encrypts the value of the KV pair and generates a hash over the encrypted data.
+ - the encrypted value is copied to the untrusted host memory
+ - the hash with a pointer to the value is inserted into the skip list in the enclave.
+
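+A minimal sketch of the split MemTable Put/Get path described above (hypothetical names; a plain dict and list stand in for the in-enclave skip list and the untrusted host memory):
+
+```python
+import hashlib
+
+class SplitMemTable:
+    """Keys, value hashes, and pointers stay inside the enclave (self.index);
+    encrypted values live in untrusted host memory (self.untrusted)."""
+    def __init__(self, encrypt, decrypt):
+        self.encrypt = encrypt        # e.g., AES-GCM under an enclave-held key
+        self.decrypt = decrypt
+        self.index = {}               # key -> (hash of encrypted value, pointer)
+        self.untrusted = []           # stand-in for untrusted host memory
+
+    def put(self, key, value: bytes):
+        ct = self.encrypt(value)                       # encrypt inside the enclave
+        digest = hashlib.sha256(ct).digest()           # hash over the encrypted value
+        self.untrusted.append(ct)                      # copy the ciphertext outside
+        self.index[key] = (digest, len(self.untrusted) - 1)
+
+    def get(self, key):
+        digest, ptr = self.index[key]
+        ct = self.untrusted[ptr]
+        assert hashlib.sha256(ct).digest() == digest   # integrity check on lookup
+        return self.decrypt(ct)
+```
+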
+- Optimizations
+  - trusted counter performance: prevent every request from blocking on the trusted counter increment.
+ - SPDK performance: add a cache within the enclave.
+ - OpenSSL AES-128-GCM: en-/decryption, HMAC
+ - 16B wide HMAC.
+
+
+### Implementation and Evaluation
+- Evaluation
+ - Compare with unmodified RocksDB
+ - stress-test the system by running a client on the same machine as the KV store.
+
+- Performance of the Direct I/O library
+  - shielded SPDK v.s. native SPDK
+
+- Impact of the EPC paging on MemTable
+  - MemTable completely resident in the enclave v.s. splitting the key and value.
+
+- Throughput and latency measurements
+ - Effect of varying byte sizes
+ - Effect of varying workloads
+ - Effect of varying threads
+ - Latency measurements
+
+- Performance of the Trusted counter
+- I/O amplification
+
+## 2. Strength (Contributions of the paper)
+1. I/O library for shielded execution (for performance)
+a direct I/O library for shielded execution based on Intel SPDK
+> perform the I/O operations without exiting the secure enclave.
+
+2. Asynchronous trusted monotonic counter
+use the lag in the sync operation in modern KV stores to asynchronously update the counters.
+> overcome the limitation of the native SGX counters.
+
+3. Secure LSM data structure
+reside outside of the enclave memory while ensuring the integrity, confidentiality and freshness of the data.
+
+4. The whole prototype is built on RocksDB with reasonable overheads.
+built on SCONE shielded execution framework
+> modified standard C library (SCONE libc)
+
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. SPDK
+SPDK enables zero-copy I/O by mapping DMA buffers to the user address space.
+> relies on actively polling the device instead of interrupts
+
+2. Do not use Merkle trees
+For its MemTable design, it argues that using a Merkle tree could allow the MemTable to be stored outside the EPC memory
+> can verify the data integrity by checking the root node hash and each hash down to the leaf storing the KV.
+> slow lookup: the key has to be decrypted on each traversal.
\ No newline at end of file
diff --git a/StoragePaperNote/SGX Storage/SeGShare-DSN'20.md b/StoragePaperNote/Security/SGX-Storage/SeGShare-DSN'20.md
old mode 100644
new mode 100755
similarity index 97%
rename from StoragePaperNote/SGX Storage/SeGShare-DSN'20.md
rename to StoragePaperNote/Security/SGX-Storage/SeGShare-DSN'20.md
index b1a53fe..466d9a9
--- a/StoragePaperNote/SGX Storage/SeGShare-DSN'20.md
+++ b/StoragePaperNote/Security/SGX-Storage/SeGShare-DSN'20.md
@@ -1,120 +1,120 @@
----
-typora-copy-images-to: ../paper_figure
----
-SeGShare: Secure Group File Sharing in the Cloud using Enclaves
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| DSN'20 | SGX Storage |
-[TOC]
-
-## 1. Summary
-
-### Motivation of this paper
-
-- Present a new architecture for end-to-end encrypted, group-based file sharing using Intel SGX
- - protect the confidentiality and integrity of all data and management files
- - overhead is extremely small in computation and storage resources.
-
-- the drawbacks of current solutions
- - For permission revocation: it is necessary to **re-encrypt** the file with a new key and distribute the new key to many users (*involves expensive cryptographic operations*)
- - the users can gain plaintext access to the **file key**.
-
-- Key reason for throughput improvement
- - does not require complex cryptographic operations on permission or membership changes.
- - build an efficient SGX-enable TLS stack, use *switchless* enclave calls for all network and file traffic.
-
-
-### File Sharing System
-
-- Attacker Model
- - CA is trusted: an attacker controlling multiple users should only have permissions according to the union of permissions of the individual controlled users.
- - Malicious cloud provider: monitor/change data on disk or in memory, view all network communications
- - do not protect the number of files, the file sizes, and the file access pattern
-- Group-based permission
- - a user is a member of a group, and a group has some permissions for files
- - main benefit: a membership update is sufficient to provide or revoke a user's access to many files instead of changing the permissions of all affected files individually.
-
-- System Design
-
-
-
-1. Setup: establish trust between users and enclave
-Establishes bilateral trust between each user $u \in U$ and the enclave running at the cloud provider.
-> Establish user trust in enclave
-> Establish enclave trust in users
-
-2. Access control
-This component is responsible to relation update and access control check
-> It uses the file manager components to *read* and *write* the required relations.
-> It is the key to enable dynamic groups without **re-encryption**.
-
-3. File managers
-It contains two parts:
-> **trusted file manager**: encrypt/decrypt the content of all files that should be written/read with *PAE_Enc/PAE_Dec* using a unique file key $SK_f$ per file. ($SK_f$ is derived from a root key $SK_r$)
-> **untrusted file manager**: passed/received all encrypted data, and handle the actual memory access.
-
-*Content store*: regular files and its corresponding ACL files
-*Group store*: group list files and member list files
-
-- Immediate revocation
- - membership updates are enforced instantly without time-consuming **re-encryption** of files.
- - a permission update only requires
- - one decryption of the corresponding ACL file.
- - one insert or update operation.
- - one encryption of the ACL.
-
-- Probabilistic authenticated encryption (PAE)
-Provide the confidentiality and integrity of content files, permissions, existing groups, and group memberships
-
-
-- Extensions
- - Deduplication
- - Introducing a third store
- - For each uploaded content file: calculate an HMAC over the file's content using the root key $SK_r$ (as the fingerprint)
- - **plaintext data is deduplicated and only a single copy is encrypted**
- - the enclave has access to the file key
- - Rollback protection for individual files/whole file system
- - Use a Merkle hash tree
- - Use multiset hashes to improve performance
- - use TEE's monotonic counter to record the version
-
-### Implementation and Evaluation
-- Implementation
- - C/C++ using the Intel SGX SDK
-
-- Evaluation
- - 1. latency to upload/download
- - 2. latency to add/revoke a user to/from his group
- - 3. latency to add/revoke a group permission
- - 4. latency to individual file rollback protection extension
- - 5. storage overhead
-
-## 2. Strength (Contributions of the paper)
-1. New architecture of end-to-end encrypted group file sharing system using a *server-side* Intel SGX.
-2. Support data deduplication, rollback protection, and separation of authentication and authorization.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-1. The drawback of client-side enclave
-the heterogeneity of end-user devices
-
-2. the point of implementing with SGX
-Given the memory and computational limitations of SGX enclaves (e.g., trusted computing base (TCB) size, trusted/untrusted transition latency), it is *far from trivial* to develop such a proxy service able to scale and sustain a high data throughput, considering dynamic access control operations.
-
-3. About Intel SGX background
-Intel SGX is an instruction set
-> 1. Memory isolation
-> 2. Attestation: allows to establish a secure channel between an external party and an enclave
->
-> > this secure channel can be used to deploy sensitive data (e.g., encryption keys) directly into the enclave.
->
-> 3. Data sealing
-> 4. Protected file system library : a library shipped with Intel SGX SDK, provides a subset of the file API, e.g., file creation, file writing, and file reading.
-> 5. Switchless calls: In SGX's SDK, calls into the enclave are replaced by writing tasks into an untrusted buffer and enclave worker threads asynchronously perform the task.
-
-
-4. About the category of file sharing systems
-> 1. Pure cryptographically protected file sharing systems
+---
+typora-copy-images-to: ../paper_figure
+---
+SeGShare: Secure Group File Sharing in the Cloud using Enclaves
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| DSN'20 | SGX Storage |
+[TOC]
+
+## 1. Summary
+
+### Motivation of this paper
+
+- Present a new architecture for end-to-end encrypted, group-based file sharing using Intel SGX
+ - protect the confidentiality and integrity of all data and management files
+ - overhead is extremely small in computation and storage resources.
+
+- the drawbacks of current solutions
+ - For permission revocation: it is necessary to **re-encrypt** the file with a new key and distribute the new key to many users (*involves expensive cryptographic operations*)
+ - the users can gain plaintext access to the **file key**.
+
+- Key reason for throughput improvement
+ - does not require complex cryptographic operations on permission or membership changes.
+  - build an efficient SGX-enabled TLS stack, and use *switchless* enclave calls for all network and file traffic.
+
+
+### File Sharing System
+
+- Attacker Model
+ - CA is trusted: an attacker controlling multiple users should only have permissions according to the union of permissions of the individual controlled users.
+ - Malicious cloud provider: monitor/change data on disk or in memory, view all network communications
+ - do not protect the number of files, the file sizes, and the file access pattern
+- Group-based permission
+ - a user is a member of a group, and a group has some permissions for files
+ - main benefit: a membership update is sufficient to provide or revoke a user's access to many files instead of changing the permissions of all affected files individually.
+
+- System Design
+
+
+
+1. Setup: establish trust between users and enclave
+Establishes bilateral trust between each user $u \in U$ and the enclave running at the cloud provider.
+> Establish user trust in enclave
+> Establish enclave trust in users
+
+2. Access control
+This component is responsible for relation updates and access control checks.
+> It uses the file manager components to *read* and *write* the required relations.
+> It is the key to enable dynamic groups without **re-encryption**.
+
+3. File managers
+It contains two parts:
+> **trusted file manager**: encrypt/decrypt the content of all files that should be written/read with *PAE_Enc/PAE_Dec* using a unique file key $SK_f$ per file ($SK_f$ is derived from a root key $SK_r$; see the key-derivation sketch below).
+> **untrusted file manager**: passes/receives all encrypted data, and handles the actual memory access.
+
+*Content store*: regular files and its corresponding ACL files
+*Group store*: group list files and member list files
+
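+A minimal sketch of per-file key derivation from the root key, assuming an HMAC-based KDF (the paper does not specify this exact construction):
+
+```python
+import hmac
+import hashlib
+
+def derive_file_key(sk_root: bytes, file_id: bytes) -> bytes:
+    """Derive a unique file key SK_f from the root key SK_r and a file identifier."""
+    return hmac.new(sk_root, b"file-key|" + file_id, hashlib.sha256).digest()
+
+# The trusted file manager would then use SK_f with probabilistic authenticated
+# encryption (PAE) to encrypt/decrypt that file's content.
+```
+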
+- Immediate revocation
+ - membership updates are enforced instantly without time-consuming **re-encryption** of files.
+ - a permission update only requires
+ - one decryption of the corresponding ACL file.
+ - one insert or update operation.
+ - one encryption of the ACL.
+
+- Probabilistic authenticated encryption (PAE)
+Provide the confidentiality and integrity of content files, permissions, existing groups, and group memberships
+
+
+- Extensions
+ - Deduplication
+ - Introducing a third store
+ - For each uploaded content file: calculate an HMAC over the file's content using the root key $SK_r$ (as the fingerprint)
+ - **plaintext data is deduplicated and only a single copy is encrypted**
+ - the enclave has access to the file key
+ - Rollback protection for individual files/whole file system
+ - Use a Merkle hash tree
+ - Use multiset hashes to improve performance
+ - use TEE's monotonic counter to record the version
+
+### Implementation and Evaluation
+- Implementation
+ - C/C++ using the Intel SGX SDK
+
+- Evaluation
+ - 1. latency to upload/download
+ - 2. latency to add/revoke a user to/from his group
+ - 3. latency to add/revoke a group permission
+ - 4. latency to individual file rollback protection extension
+ - 5. storage overhead
+
+## 2. Strength (Contributions of the paper)
+1. New architecture of end-to-end encrypted group file sharing system using a *server-side* Intel SGX.
+2. Support data deduplication, rollback protection, and separation of authentication and authorization.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. The drawback of client-side enclave
+the heterogeneity of end-user devices
+
+2. the point of implementing with SGX
+Given the memory and computational limitations of SGX enclaves (e.g., trusted computing base (TCB) size, trusted/untrusted transition latency), it is *far from trivial* to develop such a proxy service able to scale and sustain a high data throughput, considering dynamic access control operations.
+
+3. About Intel SGX background
+Intel SGX is an instruction set extension
+> 1. Memory isolation
+> 2. Attestation: allows to establish a secure channel between an external party and an enclave
+>
+> > this secure channel can be used to deploy sensitive data (e.g., encryption keys) directly into the enclave.
+>
+> 3. Data sealing
+> 4. Protected file system library : a library shipped with Intel SGX SDK, provides a subset of the file API, e.g., file creation, file writing, and file reading.
+> 5. Switchless calls: In SGX's SDK, calls into the enclave are replaced by writing tasks into an untrusted buffer and enclave worker threads asynchronously perform the task.
+
+
+4. About the category of file sharing systems
+> 1. Pure cryptographically protected file sharing systems
> 2. TEE-supported file sharing systems: NEXUS, Pesos
\ No newline at end of file
diff --git a/StoragePaperNote/SGX Storage/ShieldStore-EuroSys'19.md b/StoragePaperNote/Security/SGX-Storage/ShieldStore-EuroSys'19.md
old mode 100644
new mode 100755
similarity index 98%
rename from StoragePaperNote/SGX Storage/ShieldStore-EuroSys'19.md
rename to StoragePaperNote/Security/SGX-Storage/ShieldStore-EuroSys'19.md
index 78bed52..05bbcc2
--- a/StoragePaperNote/SGX Storage/ShieldStore-EuroSys'19.md
+++ b/StoragePaperNote/Security/SGX-Storage/ShieldStore-EuroSys'19.md
@@ -1,130 +1,130 @@
----
-typora-copy-images-to: ../paper_figure
----
-ShieldStore: Shielded In-memory Key-Value Storage with SGX
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| EuroSys'19 | SGX |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
- - the memory requirements of in-memory key-value store are far larger than the protected memory limit.
- - main data structures (e.g., pointers and keys) do not match well with the coarse-grained paging of the SGX memory extension technique.
- - SGX enables cloud users to run their applications **securely** on the *remote* cloud servers whose operating systems and hardware are exposed to potentially malicious remote attackers and staff.
-
-This paper wants to overcome the memory restriction.
-> maintain the main data structures in unprotected memory with *each key-value pair individually encrypted*
-> integrity-protected by its secure component running inside an enclave.
-
-### ShieldStore
-- SGX basis
- - one major limitation of SGX: the capacity of the protected memory region, enclave page cache (EPC). (128MB)
- - If enclave memory usage is larger than the EPC limit, some pages are *evicted* from EPC, and remapped to the *non-EPC* memory region with **page granularity** encryption and integrity protection.
- - Performance penalty: demand paging step which maps the page back to the secure region along with the eviction of another victim page.
- - The data in EPC are in plaintext only in on-chip caches, and they are encrypted and integrity-protected when are in the external memory.
- - Creating a huge Merkle tree for tens or hundreds of gigabytes of main memory at cacheline granularity will increase the integrity verification latency intolerably.
- - the effective EPC is smaller than the 128MB reserved region (in practice around 90MB) due to **security meta-data**.
- - Cost of crossing enclave boundary
- - an enclave cannot issue a system call (any system services require exiting the enclave), needs hardware operations such as *flushing TLB entires*.
-
-- Main idea
- - Instead of relying on the page-oriented enclave memory extension provided by SGX, it designs that fine-grained application-specific data protection. (maintains the majority of data structures in the non-enclave memory region)
- - Inside the enclave: encrypts each key-value pair.
- - Non-enclave memory: only place the encrypted and integrity-protected data.
-
-- Architecture
-
-
-
-1. The client remote-attests the storage server system
-
-verifying the SGX support of the processor, the code, and other critical memory state of an enclave.
-
-2. The client and storage server running in the enclave generate session keys an exchange them.
-
-3. The client sends operations through the secure channel using the exchanged session key.
-
-Clients do not directly access the ciphertexts on the server side
-> the server in the enclave will decrypt the retrieved data
-> encrypt them again with the session key used for the client, send the response to the client
-
-
-- The performance degradation of accessing enclave memory pages when the enclave memory size exceeds the EPC limit. (read and write)
- - Three types
- - No SGX: SGX disable
- - SGX enclave: the entire data resides in the enclave
- - SGX unprotected: access instructions are running inside the enclave, the data is in the unprotected memory region
- - reading or writing unprotected data from an enclave shows similar latencies to No SGX
-
-
-
-- Baseline Approach
-
-Employ a hash-based index structure, and use chaining to resolve collisions in hash-based index.
-> store the entire hash table in the enclave memory.
-> As the EPC region can cover only a small portion of the total database size, a data access can cause page eviction and demand paging between an EPC page and non-EPC page.
-
-- SheildStore Design
- - Overall architecture:
- - the main hash table storing key and data pair is placed in the **unprotected memory region**. Each data entry must be encrypted by ShieldStore in an enclave.
- - the encrypted data entry is retrieved from the main hash table in unprotected memory region. (decryption and integrity verification MAC)
- - Fine-grained Key-value encryption
- - MAC Hashing: attaches a 128-bit hash value created from *encrypted key/value, key/value sizes, key-index, and IV/counter,* to the data entry.
- - Integrity verification
- - create many in-enclave hashes
- - Each MAC hash value covers one or more buckets
- - Persistency Support (seal secure meta-data in the enclave)
- - fork a child process stores the key-value entries on a file. (initiated every 60 seconds)
- - write the encrypted key-value entries in unprotected memory are directly written to storage.
-
-- Optimizations
- - Extra Heap Allocator
- - Each call to the outside heap allocator requires a costly exit from the enclave.
- - Add custom allocator which runs inside the enclave, but allocates unprotected memory.
- - MAC bucketing
- - reduce the overhead of the MAC accesses through pointer chasing, each hash chain has its own MAC bucket (contains only the MAC fields of data entries of the hash bucket).
- - Multi-threading
- - assign each thread to an exclusive partition of hash keys
- - Searching encrypted key
- - use a small 1 byte key hint in the data entry
- - the key hint is a hash of the plaintext key stored in each data entry (leakage of 1 byte if hashed key information)
-
-
-### Implementation and Evaluation
-- Workloads
- - uniform workload: generate key with uniform distribution
- - zipf distribution of skewness 0.99 (used in YCSB)
-
-- Evaluation
- - Baseline: put the entire hash table in enclave memory
- - GrapheneSGX: Memcached + graphene
- - ShieldBase: without optimizations
- - ShieldOpt: final version
-
-1. Standalone evaluation
-Overall Performance
-Multi-core Scalability
-Effect of Optimizations
-Trade-offs in MAC hashes
-
-2. Comparison to Eleos
-
-3. Networked Evaluation
-a new extra overhead is added to ShieldStore
-> the cost of receiving requests and sending responses through the socket interfaces
-> using HotCalls
-
-
-## 2. Strength (Contributions of the paper)
-- Good experiments
-
-## 3. Weakness (Limitations of the paper)
-- SGX does not provide direct protection for such side channels
-- This paper also assumes communication between clients and ShieldStore is protected through encryption.
-- The hash-based index structure of ShieldStore does not support range query operations.
-
-## 4. Some Insights (Future work)
+---
+typora-copy-images-to: ../paper_figure
+---
+ShieldStore: Shielded In-memory Key-Value Storage with SGX
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| EuroSys'19 | SGX |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+ - the memory requirements of in-memory key-value store are far larger than the protected memory limit.
+ - main data structures (e.g., pointers and keys) do not match well with the coarse-grained paging of the SGX memory extension technique.
+ - SGX enables cloud users to run their applications **securely** on the *remote* cloud servers whose operating systems and hardware are exposed to potentially malicious remote attackers and staff.
+
+This paper wants to overcome the memory restriction.
+> maintain the main data structures in unprotected memory with *each key-value pair individually encrypted*
+> integrity-protected by its secure component running inside an enclave.
+
+### ShieldStore
+- SGX basis
+ - one major limitation of SGX: the capacity of the protected memory region, enclave page cache (EPC). (128MB)
+ - If enclave memory usage is larger than the EPC limit, some pages are *evicted* from EPC, and remapped to the *non-EPC* memory region with **page granularity** encryption and integrity protection.
+ - Performance penalty: demand paging step which maps the page back to the secure region along with the eviction of another victim page.
+  - The data in EPC are in plaintext only in on-chip caches, and they are encrypted and integrity-protected when they are in the external memory.
+ - Creating a huge Merkle tree for tens or hundreds of gigabytes of main memory at cacheline granularity will increase the integrity verification latency intolerably.
+ - the effective EPC is smaller than the 128MB reserved region (in practice around 90MB) due to **security meta-data**.
+ - Cost of crossing enclave boundary
+    - an enclave cannot issue a system call (any system services require exiting the enclave), which needs hardware operations such as *flushing TLB entries*.
+
+- Main idea
+  - Instead of relying on the page-oriented enclave memory extension provided by SGX, it designs fine-grained, application-specific data protection (maintaining the majority of data structures in the non-enclave memory region).
+ - Inside the enclave: encrypts each key-value pair.
+ - Non-enclave memory: only place the encrypted and integrity-protected data.
+
+- Architecture
+
+
+
+1. The client remote-attests the storage server system
+
+verifying the SGX support of the processor, the code, and other critical memory state of an enclave.
+
+2. The client and storage server running in the enclave generate session keys and exchange them.
+
+3. The client sends operations through the secure channel using the exchanged session key.
+
+Clients do not directly access the ciphertexts on the server side
+> the server in the enclave will decrypt the retrieved data
+> encrypt them again with the session key used for the client, and send the response to the client
+
+
+- The performance degradation of accessing enclave memory pages when the enclave memory size exceeds the EPC limit. (read and write)
+ - Three types
+    - No SGX: SGX disabled
+ - SGX enclave: the entire data resides in the enclave
+ - SGX unprotected: access instructions are running inside the enclave, the data is in the unprotected memory region
+ - reading or writing unprotected data from an enclave shows similar latencies to No SGX
+
+
+
+- Baseline Approach
+
+Employ a hash-based index structure, and use chaining to resolve collisions in hash-based index.
+> store the entire hash table in the enclave memory.
+> As the EPC region can cover only a small portion of the total database size, a data access can cause page eviction and demand paging between an EPC page and non-EPC page.
+
+- ShieldStore Design
+ - Overall architecture:
+    - the main hash table storing key and data pairs is placed in the **unprotected memory region**. Each data entry must be encrypted by ShieldStore in an enclave.
+    - the encrypted data entry is retrieved from the main hash table in the unprotected memory region (then decrypted and integrity-verified via its MAC).
+ - Fine-grained Key-value encryption
+    - MAC Hashing: attaches a 128-bit hash value, created from the *encrypted key/value, key/value sizes, key-index, and IV/counter*, to the data entry (see the sketch after this list).
+ - Integrity verification
+ - create many in-enclave hashes
+ - Each MAC hash value covers one or more buckets
+ - Persistency Support (seal secure meta-data in the enclave)
+    - fork a child process that stores the key-value entries in a file (initiated every 60 seconds)
+    - the encrypted key-value entries in unprotected memory are directly written to storage
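+
+A minimal sketch of the per-entry encrypt-then-MAC idea described above, assuming the third-party pyca/cryptography package for AES-CTR; the field layout and names are illustrative, not ShieldStore's actual code:
+
+```python
+import hmac, hashlib, os, struct
+from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+
+def seal_entry(enc_key: bytes, mac_key: bytes, key_index: int,
+               key: bytes, value: bytes, counter: int) -> dict:
+    """enc_key/mac_key (16/24/32-byte AES and MAC keys) live inside the enclave;
+    the returned entry is what would be placed in the unprotected hash table."""
+    iv = os.urandom(16)                                    # per-entry IV/counter block
+    enc = Cipher(algorithms.AES(enc_key), modes.CTR(iv)).encryptor()
+    ct = enc.update(key + value) + enc.finalize()          # encrypted key/value
+    meta = struct.pack("<IIQQ", len(key), len(value), key_index, counter)
+    mac = hmac.new(mac_key, ct + meta + iv, hashlib.sha256).digest()[:16]  # 128-bit MAC
+    return {"iv": iv, "ct": ct, "meta": meta, "mac": mac}
+```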
+
+- Optimizations
+ - Extra Heap Allocator
+ - Each call to the outside heap allocator requires a costly exit from the enclave.
+    - Add a custom allocator that runs inside the enclave but allocates unprotected memory.
+ - MAC bucketing
+    - to reduce the overhead of MAC accesses through pointer chasing, each hash chain has its own MAC bucket (containing only the MAC fields of the data entries in that hash bucket).
+ - Multi-threading
+ - assign each thread to an exclusive partition of hash keys
+ - Searching encrypted key
+    - use a small 1-byte key hint in the data entry
+    - the key hint is a hash of the plaintext key stored in each data entry (leaks 1 byte of hashed-key information)
+
+
+### Implementation and Evaluation
+- Workloads
+  - uniform workload: generate keys with a uniform distribution
+  - skewed workload: Zipf distribution with skewness 0.99 (as used in YCSB)
+
+- Evaluation
+ - Baseline: put the entire hash table in enclave memory
+ - GrapheneSGX: Memcached + graphene
+ - ShieldBase: without optimizations
+ - ShieldOpt: final version
+
+1. Standalone evaluation
+Overall Performance
+Multi-core Scalability
+Effect of Optimizations
+Trade-offs in MAC hashes
+
+2. Comparison to Eleos
+
+3. Networked Evaluation
+an extra overhead is added to ShieldStore
+> the cost of receiving requests and sending responses through the socket interfaces
+> using HotCalls
+
+
+## 2. Strength (Contributions of the paper)
+- Good experiments
+
+## 3. Weakness (Limitations of the paper)
+- SGX does not provide direct protection against side-channel attacks
+- This paper also assumes communication between clients and ShieldStore is protected through encryption.
+- The hash-based index structure of ShieldStore does not support range query operations.
+
+## 4. Some Insights (Future work)
- Reading unprotected memory from an enclave is as fast as no-SGX reads.
\ No newline at end of file
diff --git a/StoragePaperNote/SGX Storage/StorageDataPathSGX-arxiv.md b/StoragePaperNote/Security/SGX-Storage/StorageDataPathSGX-arxiv.md
old mode 100644
new mode 100755
similarity index 97%
rename from StoragePaperNote/SGX Storage/StorageDataPathSGX-arxiv.md
rename to StoragePaperNote/Security/SGX-Storage/StorageDataPathSGX-arxiv.md
index d5142e7..2d54292
--- a/StoragePaperNote/SGX Storage/StorageDataPathSGX-arxiv.md
+++ b/StoragePaperNote/Security/SGX-Storage/StorageDataPathSGX-arxiv.md
@@ -1,96 +1,96 @@
----
-typora-copy-images-to: ../paper_figure
----
-Securing the Storage Data Path with SGX Enclaves
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| arxiv | Storage SGX |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper intends to explore the use of SGX enclaves as a mean to improve the security of handling keys and data in storage systems.
-
-- Motivation
-this paper studies the performance of securing the data path and hardening the security of data-at-rest encryption in storage systems.
-> such as block storage systems, file systems or object storage.
-
-1. high throughput, comparable to running the same operation without enclaves.
-2. the subtleties arise from using SGX
-> how much development work is required in order to get acceptable
-performance?
-
-3. quantify the performance effect of the storage encryption use-case, and understand the subtleties involved that may affect a wide array of other use cases.
-
-### Method Name
-- SGX performance overhead
-1. the actual overhead of accessing the encrypted memory via the MEE.
-2. the overhead associated with entering and exiting an enclave.
-
-> ECALLs (defined in an "edl" file) has a performance impact due to the CPU's context switches.
-> the limitation of the EPC size, and when operating with memory that exceeds the EPC size there is a need for paging of EPC pages to regular memory. (additional latency, encryption of this data before it lands in regular memory)
-
-- Data-at-rest encryption
-either clear text or encrypted in transit (e.g., HTTPS), and then encrypted before being persisted to disk.
-> ensure that all persistent data is always encrypted, so that loss of hardware (e.g., malicious user, hardware failure), does not compromise the data.
-
-- Software encryption
-the encryption keys must reside in clear text in memory, presenting a significant security risk.
-> sensitive data keys are vulnerable to either privileged users or to memory sniffing techniques.
-
-
-- Protecting the keys using SGX
-By placing the software encryption process and key handling inside enclaves.
-> perform all encryption and decryption in SGX
-
-Enclaves are allowed to access the general memory, and so data buffers (both encrypted and cleartext) can reside outside of the enclaves memory.
-> call ECALL, gets pointers to two buffers in the general (non-encrypted) memory.
-
-
-
-- End-to-end Data protection
-Data-in-transit encryption is now required in many scenarios.
-> via one of the prevalent standards such asa HTTPS, SSL, TLS.
-
-
-
-### Implementation and Evaluation
-- Evaluation
-1. implement four variations of the function **find_max**.
-> ECALL, in, user_check
-> in: copy the data to enclave.
-> user_check: without copying it into the enclave's memory.
-
-2. data encryption (AES128-GCM)
-> trusted: sgxsdk, sgxssl
-> untrusted: openssl
-
-result: openssl > sgxssl > sgxsdk
-
-3. The effect of multiple threads
-> Multiple processes
-> Multiple threads with a single enclave
-> Multiple threads with separate enclaves
-
-
-
-## 2. Strength (Contributions of the paper)
-1. the most important thing mentioned by this paper is *without actual performance testing*, it is difficult to predict the performance of computations running inside an SGX enclave.
-> choosing buffer sizes, library configuration
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-1. In this paper, it mentions the **original concept** of SGX.
-created with less ambitious use cases in mind, in which minimal parts of the computation (and the code associated with it) are placed into enclaves.
-> only a saml part of the application needs to reside in an enclave.
-> data transformations, local test on data, and cryptographic functions on data blocks.
-
-2. It also mentions some workloads are bad fro SGX performance.
-> require frequent small random access operations
-> require a very large amount of data to be in encrypted memory simultaneously
-
-3. SGX basis
-**Memory encryption engine (MEE)**: perform real time encryption of all communication between the CPU and the memory.
-> MEE is only invoked on a special designated area of the memory called the **Enclave Page Cache (EPC)**. (128MB: 96MB after reducing space used for managing the enclaves).
+---
+typora-copy-images-to: ../paper_figure
+---
+Securing the Storage Data Path with SGX Enclaves
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| arxiv | Storage SGX |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper intends to explore the use of SGX enclaves as a means to improve the security of handling keys and data in storage systems.
+
+- Motivation
+this paper studies the performance of securing the data path and hardening the security of data-at-rest encryption in storage systems.
+> such as block storage systems, file systems or object storage.
+
+1. high throughput, comparable to running the same operation without enclaves.
+2. the subtleties arise from using SGX
+> how much development work is required in order to get acceptable performance?
+
+3. quantify the performance effect of the storage encryption use-case, and understand the subtleties involved that may affect a wide array of other use cases.
+
+### Method Name
+- SGX performance overhead
+1. the actual overhead of accessing the encrypted memory via the MEE.
+2. the overhead associated with entering and exiting an enclave.
+
+> ECALLs (defined in an "edl" file) have a performance impact due to the CPU's context switches.
+> the limitation of the EPC size, and when operating with memory that exceeds the EPC size there is a need for paging of EPC pages to regular memory. (additional latency, encryption of this data before it lands in regular memory)
+
+- Data-at-rest encryption
+data is either in clear text or encrypted in transit (e.g., HTTPS), and is then encrypted before being persisted to disk.
+> ensure that all persistent data is always encrypted, so that loss of hardware (e.g., malicious user, hardware failure), does not compromise the data.
+
+- Software encryption
+the encryption keys must reside in clear text in memory, presenting a significant security risk.
+> sensitive data keys are vulnerable to either privileged users or to memory sniffing techniques.
+
+
+- Protecting the keys using SGX
+By placing the software encryption process and key handling inside enclaves.
+> perform all encryption and decryption in SGX
+
+Enclaves are allowed to access the general memory, and so data buffers (both encrypted and cleartext) can reside outside of the enclave's memory.
+> the ECALL gets pointers to two buffers in the general (non-encrypted) memory.
+
+
+
+- End-to-end Data protection
+Data-in-transit encryption is now required in many scenarios.
+> via one of the prevalent standards, such as HTTPS, SSL, or TLS.
+
+
+
+### Implementation and Evaluation
+- Evaluation
+1. implement four variations of the function **find_max**.
+> ECALL, in, user_check
+> in: copies the data into the enclave.
+> user_check: passes the pointer without copying the data into the enclave's memory.
+
+2. data encryption (AES128-GCM)
+> trusted: sgxsdk, sgxssl
+> untrusted: openssl
+
+result: openssl > sgxssl > sgxsdk
+
+3. The effect of multiple threads
+> Multiple processes
+> Multiple threads with a single enclave
+> Multiple threads with separate enclaves
+
+
+
+## 2. Strength (Contributions of the paper)
+1. the most important point made by this paper is that, *without actual performance testing*, it is difficult to predict the performance of computations running inside an SGX enclave.
+> choosing buffer sizes, library configuration
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+1. This paper mentions the **original concept** of SGX:
+created with less ambitious use cases in mind, in which minimal parts of the computation (and the code associated with it) are placed into enclaves.
+> only a small part of the application needs to reside in an enclave.
+> data transformations, local test on data, and cryptographic functions on data blocks.
+
+2. It also mentions that some workloads are bad for SGX performance.
+> require frequent small random access operations
+> require a very large amount of data to be in encrypted memory simultaneously
+
+3. SGX basis
+**Memory encryption engine (MEE)**: performs real-time encryption of all communication between the CPU and the memory.
+> MEE is only invoked on a special designated area of the memory called the **Enclave Page Cache (EPC)**. (128MB: 96MB after reducing space used for managing the enclaves).
diff --git a/StoragePaperNote/Security/SGX-Technique/SGXPerf-Middleware'18.md b/StoragePaperNote/Security/SGX-Technique/SGXPerf-Middleware'18.md
new file mode 100755
index 0000000..f3266b1
--- /dev/null
+++ b/StoragePaperNote/Security/SGX-Technique/SGXPerf-Middleware'18.md
@@ -0,0 +1,92 @@
+---
+typora-copy-images-to: ../paper_figure
+---
+sgx-perf: A Performance Analysis Tool for Intel SGX Enclaves
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| Middleware'18 | SGX performance |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+ - understanding the *performance implications* of SGX and the offered programming support is still in its infancy.
+  - this leads to time-consuming trial-and-error testing and poses the risk of poor performance.
+  - current methods (e.g., VTune) do not suffice to empower users to implement well-performing applications.
+
+### SGX-Perf
+
+- SGX architecture
+  - the Trusted Runtime System (TRTS) and Untrusted Runtime System (URTS) handle the **enclave transitions** and **call dispatching**.
+ - Enclave performance considerations
+    - enclave transitions cost between 8,600 and 14,000 cycles
+ - need to reduce the number of enclave transitions
+ - in-enclave synchronization
+      - as sleeping is not possible inside an enclave,
+ - in-enclave synchronization primitives provided by the SGX SDK implement additional **ocalls** to sleep outside the enclave.
+ - enclave paging
+      - EPC usage: one metadata page, the enclave's code, the heap, and a thread control structure (TCS) page, stack, and state-save-area (SSA) pages for each configured enclave thread.
+ - added enclave transitions to handle page faults
+ - extra computation needed for cryptographic operations
+
+
+
+- SGX problems and solutions
+ - The overhead of using enclaves
+ - the number of enclave transitions during execution and their duration
+ - the number of paging events: page fault + encryption
+ - Reducing the number of enclave transitions should be **prioritized**.
+ - Design trade-offs
+ - TCB size and security issue
+    - **reordering** should be considered first in performance optimization.
+
+
+
+- sgx-perf design
+  - tracing ecalls and ocalls: changes the symbols of the wrapper code.
+ - logger
+ - tracing in-enclave synchronization
+
+### Implementation and Evaluation
+- Evaluation
+ - Key question:
+ - what is the overhead of running an application with sgx-perf?
+ - the overhead of logger
+ - can sgx-perf detect optimization opportunities in systems that use Intel SGX?
+ - evaluation application
+    - TaLoS: TLS library
+ - SecureKeeper: a key-value store
+ - SQLite
+ - LibreSSL partitioned with Glamdring
+ - automated partition code
+
+1. performance overhead of logging
+2. optimization of enclaves
+
+
+## 2. Strength (Contributions of the paper)
+1. summarizes the identified performance-critical factors of SGX
+2. presents sgx-perf, a collection of tools for high-level dynamic performance analysis of SGX-based applications.
+> performs fine-grained profiling of performance-critical events in enclaves
+> also provides recommendations on how to improve enclave performance
+
+3. shows how to use sgx-perf to improve the performance of SGX-based applications.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
+1. how to optimize the enclave performance?
+> asynchronous calls
+> extended memory management support
+
+2. EPC protection
+All enclave memory is **fully and transparently** encrypted as well as integrity protected.
+
+3. Security enhancements
+It is necessary to reduce the attack surface of enclave interfaces.
+> public ecall: can always be called
+> private ecall: can only be called during an ocall
+>
+> > otherwise, an attacker may change the control flow of the program's execution and gain access to enclave secrets.
diff --git a/StoragePaperNote/Security/SGX-Technique/talos-arxiv'17.md b/StoragePaperNote/Security/SGX-Technique/talos-arxiv'17.md
new file mode 100755
index 0000000..e3a60e4
--- /dev/null
+++ b/StoragePaperNote/Security/SGX-Technique/talos-arxiv'17.md
@@ -0,0 +1,88 @@
+---
+typora-copy-images-to: ../paper_figure
+---
+TaLoS: Secure and Transparent TLS Termination inside SGX Enclaves
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| arxiv'17 | SGX communication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+ - design a drop-in replacement for existing transport layer security (TLS) libraries that protects itself from a malicious environment
+ - running inside an Intel SGX trusted execution environment.
+
+- Main idea
+ - TaLoS protects *private keys* and *session keys* from a malicious environment.
+
+- Main goals
+ - security and privacy: resilient to different threats
+ - ease-of-deployment: be easy to deploy with existing applications
+ - no changes to the client implementations
+ - performance overhead: impose a low performance overhead with respect to native application execution.
+
+### TaLoS
+
+- Intel SGX background
+ - enclave code is permitted to access enclave and non-enclave memory.
+  - when EPC pages are flushed to DRAM or disk, they are **transparently** encrypted and integrity-protected by the on-chip memory encryption engine.
+
+- TaLoS TLS termination
+ - place the following parts inside the enclave:
+ - private keys
+ - session keys
+ - code related to the TLS protocol implementation (SSL_read(), SSL_write())
+ - non-sensitive code and data are placed outside of the enclave for performance reasons.
+
+
+
+- Enclave TLS implementation
+ - Secure callbacks: use ocalls rather than regular function calls.
+ - store the address of the callback function
+ - generate a trampoline function to perform an ocall into the outside application code
+ - Shadowing (shadow data structure)
+ - TaLoS maintains a sanitized copy of the SSL structure outside the enclave, with all sensitive data removed.
+
+- Reducing enclave transitions
+  1. use a pre-allocated memory pool to manage small objects
+ - Instead of performing ocalls to allocate non-sensitive objects from within the enclave.
+ 2. avoids ocalls to pthread by using the thread locks implementation provided by the SGX SDK
+ 3. use SGX random number generator inside the enclave to avoid ocalls to the random system call.
+ 4. storing application-specific data written to TLS data structures outside the enclave
+ - reduce the number of ecalls
+
+- Reducing transition overhead
+
+ - asynchronous enclave transitions
+
+
+
+
+
+### Implementation and Evaluation
+
+- Implementation
+ - TaLoS exposes 205 ecalls and 55 ocalls
+ - 282,200 LoC
+
+- Evaluation
+ - Configuration:
+ - Using Apache and Squid
+ - Enclave TLS overhead
+ - CPU is the bottleneck
+ - TaLoS incurs a 23% performance overhead
+ - Impact of asynchronous calls
+ - increase the performance by at least 57%
+ - Scalability
+ - increase the number of CPU cores
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
+1. The difference from SGX-SSL and mBedTLS-SGX
+Both SGX-SSL and mBedTLS-SGX require the entire application to execute inside the SGX enclave, whereas TaLoS keeps only the sensitive TLS state and code inside the enclave.
+
diff --git a/StoragePaperNote/Security/Secret Sharing/AONT-RS-FAST'11.md b/StoragePaperNote/Security/Secret-Sharing/AONT-RS-FAST'11.md
old mode 100644
new mode 100755
similarity index 98%
rename from StoragePaperNote/Security/Secret Sharing/AONT-RS-FAST'11.md
rename to StoragePaperNote/Security/Secret-Sharing/AONT-RS-FAST'11.md
index f4b7b52..bcd4e95
--- a/StoragePaperNote/Security/Secret Sharing/AONT-RS-FAST'11.md
+++ b/StoragePaperNote/Security/Secret-Sharing/AONT-RS-FAST'11.md
@@ -1,62 +1,62 @@
----
-typora-copy-images-to: paper_figure
----
-AONT-RS: Blending Security and Performance in Dispersed Storage Systems
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| FAST'11 | Secret Sharing |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper aims to modify the original Rabin's scheme that it can achieve improved computational performance, security and integrity.
-> Main Idea: combining All-Or-Nothing Transform (AONT) with systematic RS erasure codes.
-
-The main problem: how to provide security without securely storing encryption keys.
-Related Work:
-> Information dispersal algorithm (IDA)
-> Secret Sharing Made Short (SSMS)
-
-### AONT-RS
-Aiming to provide the **computationally secure** secret sharing scheme.
-- Enrich Rabin's IDA with two respects:
-1. employ a variant of All-or-nothing Transform (AONT) as a **preprocessing** pass over the data.
-> AONT: can be viewed as a (s+1, s+1) threshold scheme $\rightarrow$ data composed of $s$ words is encoded into $s+1$ different words
-> original workflow of AONT: A random key $K$ is chosen, and each codeword $c_i$ is calculated as: $c_i = d_i \oplus E(K, i+1)$
-> It adds a **canary** $d_s$, which is fixed value. (for checking integrity), then generates a hash $h$ of the $s+1$ codewords using a standard hashing function. Then calculate the final block $c_{s+1}$ as: $c_{s+1} = K \oplus h$.
-
-2. employ a **systematic** erasure code instead of non-systematic one. To improve the performance, because it eliminates the need to encode the first $k$ codewords. (as well as of the process of decoding)
-
-- Encoding process:
-
-**difference**: The hash value and random key are combined via bitwise exclusive-or to form a difference, which is append to the encrypted data to form the AONT package.
-
-- Decoding process:
-
-**Get the key**: first compute the hash $h$, then $(K \oplus h) \oplus h = K$
-
-
-- Security Evaluation
-AONT-RS has the property that unless one has all of the encrypted data, one cannot decode any of it.
-> one needs all of the data to discover $K$. And one cannot decode any of the data without $K$.
-> For the encoding function, it guarantees that enumeration is the only way to discover $K$'s value. $\rightarrow$ **computational security**.
-
-### Implementation and Evaluation
-- Using Cauchy Reed-Solomon coding for the multiplication and can improve performance significantly.
-- Benchmark Performance
-> 1. all benchmark are performed in using a **single thread**.
-> 2. the time equals the sum of the time to encrypt plus the time to hash.
-> > Fast AONT: SHA-256, MD5
-> > Secure AONT: AES-256, RC4-128
-
-
-Accesser: performs the **block-to-slice** encoding and decoding
-- Clients had 10 Gbps NICs, servers had 1 Gbps NICs, bottleneck is **CPU**.
-## 2. Strength (Contributions of the paper)
-1. This paper provides a new dispersal algorithm which can provide security without the need for a separate key management system.
-2. It also conducts the theoretical and actual performance analysis.
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+AONT-RS: Blending Security and Performance in Dispersed Storage Systems
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| FAST'11 | Secret Sharing |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper aims to modify Rabin's original scheme so that it can achieve improved computational performance, security, and integrity.
+> Main Idea: combining All-Or-Nothing Transform (AONT) with systematic RS erasure codes.
+
+The main problem: how to provide security without securely storing encryption keys.
+Related Work:
+> Information dispersal algorithm (IDA)
+> Secret Sharing Made Short (SSMS)
+
+### AONT-RS
+Aiming to provide a **computationally secure** secret sharing scheme.
+- Enrich Rabin's IDA in two respects:
+1. employ a variant of All-or-nothing Transform (AONT) as a **preprocessing** pass over the data.
+> AONT: can be viewed as an $(s+1, s+1)$ threshold scheme $\rightarrow$ data composed of $s$ words is encoded into $s+1$ different words
+> original workflow of AONT: a random key $K$ is chosen, and each codeword $c_i$ is calculated as $c_i = d_i \oplus E(K, i+1)$
+> It adds a **canary** $d_s$, which is a fixed value (for checking integrity), then generates a hash $h$ of the $s+1$ codewords using a standard hash function, and calculates the final block $c_{s+1}$ as $c_{s+1} = K \oplus h$.
+
+2. employ a **systematic** erasure code instead of a non-systematic one. This improves performance because it eliminates the need to encode the first $k$ codewords (and likewise simplifies decoding).
+
+- Encoding process:
+
+**difference**: The hash value and random key are combined via bitwise exclusive-or to form a difference, which is appended to the encrypted data to form the AONT package (a minimal sketch of the transform is given after the decoding step below).
+
+- Decoding process:
+
+**Get the key**: first compute the hash $h$, then $(K \oplus h) \oplus h = K$
+
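+A minimal sketch of the AONT encoding step described above, assuming 32-byte words and HMAC-SHA256 in place of the block cipher $E(K, i+1)$ (illustrative only; the paper then splits the resulting package with a systematic RS code):
+
+```python
+import hashlib, hmac, secrets
+
+WORD = 32
+CANARY = b"\x00" * WORD          # fixed canary d_s for integrity checking
+
+def pad(key, i):
+    # pseudorandom pad standing in for E(K, i)
+    return hmac.new(key, i.to_bytes(8, "big"), hashlib.sha256).digest()
+
+def aont_encode(words):
+    key = secrets.token_bytes(WORD)                        # random key K
+    data = [w[:WORD].ljust(WORD, b"\0") for w in words] + [CANARY]
+    code = [bytes(a ^ b for a, b in zip(d, pad(key, i + 1)))
+            for i, d in enumerate(data)]                   # c_i = d_i XOR E(K, i+1)
+    h = hashlib.sha256(b"".join(code)).digest()            # hash of the s+1 codewords
+    code.append(bytes(a ^ b for a, b in zip(key, h)))      # final block = K XOR h
+    return code
+
+# To decode: recompute h over code[:-1], recover K = code[-1] XOR h,
+# undo the per-word pads, and check that the last data word equals CANARY.
+```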
+
+- Security Evaluation
+AONT-RS has the property that unless one has all of the encrypted data, one cannot decode any of it.
+> one needs all of the data to discover $K$. And one cannot decode any of the data without $K$.
+> For the encoding function, it guarantees that enumeration is the only way to discover $K$'s value. $\rightarrow$ **computational security**.
+
+### Implementation and Evaluation
+- Using Cauchy Reed-Solomon coding converts the Galois-field multiplications into XORs and can improve performance significantly.
+- Benchmark Performance
+> 1. all benchmarks are performed using a **single thread**.
+> 2. the time equals the sum of the time to encrypt plus the time to hash.
+> > Fast AONT: SHA-256, MD5
+> > Secure AONT: AES-256, RC4-128
+
+
+Accesser: performs the **block-to-slice** encoding and decoding
+- Clients had 10 Gbps NICs, servers had 1 Gbps NICs, bottleneck is **CPU**.
+## 2. Strength (Contributions of the paper)
+1. This paper provides a new dispersal algorithm which can provide security without the need for a separate key management system.
+2. It also conducts the theoretical and actual performance analysis.
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
 This paper does not mention how to choose the configuration parameters so as to make better policy decisions. This would depend on the characteristics of the targeted workload.
\ No newline at end of file
diff --git a/StoragePaperNote/Security/Secret Sharing/SSSS-ACMCommunication'79.md b/StoragePaperNote/Security/Secret-Sharing/SSSS-ACMCommunication'79.md
old mode 100644
new mode 100755
similarity index 97%
rename from StoragePaperNote/Security/Secret Sharing/SSSS-ACMCommunication'79.md
rename to StoragePaperNote/Security/Secret-Sharing/SSSS-ACMCommunication'79.md
index 0af1155..b7999c3
--- a/StoragePaperNote/Security/Secret Sharing/SSSS-ACMCommunication'79.md
+++ b/StoragePaperNote/Security/Secret-Sharing/SSSS-ACMCommunication'79.md
@@ -1,41 +1,41 @@
----
-typora-copy-images-to: paper_figure
----
-How to Share a Secret
-------------------------------------------
-| Venue | Category |
-| :----------------------: | :----------------------: |
-| Communication of the ACM 1979 | Secret Sharing Algorithm |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Goal: design a convenient secret sharing algorithm $(k, n)$ *threshold scheme*.
-### Shamir Secret Sharing
-Based on **polynimial interpolation**, for a $(k, n)$ scheme, and data $D$
-1. pick a random $k - 1$ degree polynomial,
-$$
-q(x) = a_0 + a_1x + a_2x^2 + a_3x^3 + ...... + a_{k-1}x^{k-1}
-$$
-2. Generate $n$ shares
-$$
-a_0 = D, D_1 = q(1), D_2 = q(2), ......, D_n = q(n)
-$$
-3. Given any subset of $k$ of there $D_i$ values, with their identifying indices, it can calculate the coefficients of $q(x)$ by interpolation, and then evaluate $a_0 = q(0) = D$
-
-The efficiency of polynomial evaluation and interpolation
-> $O(nlog^2n)$
-
-- To make the claim more precise, it can use **modular arithmetic** instead of real arithmetic.
-
-- If $D$ is long, it is advisable to break it into shorter blocks of bits (can be handled separately.)
-
-## 2. Strength (Contributions of the paper)
-1. When $k$ is kept fixed, $D_i$ pieces can be dynamically added or deleted *without affecting the other $D_i$ pieces*.
-
-2. It is easy to change the $D_i$ pieces without changing the original data $D$.
-
-## 3. Weakness (Limitations of the paper)
-1. The storage overhead (Storage space blow up) is very large.
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+How to Share a Secret
+------------------------------------------
+| Venue | Category |
+| :----------------------: | :----------------------: |
+| Communication of the ACM 1979 | Secret Sharing Algorithm |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Goal: design a convenient $(k, n)$ *threshold* secret sharing scheme.
+### Shamir Secret Sharing
+Based on **polynomial interpolation**; for a $(k, n)$ scheme and data $D$:
+1. pick a random degree-$(k-1)$ polynomial,
+$$
+q(x) = a_0 + a_1 x + a_2 x^2 + a_3 x^3 + \cdots + a_{k-1} x^{k-1}
+$$
+2. generate $n$ shares
+$$
+a_0 = D, \quad D_1 = q(1), \; D_2 = q(2), \; \ldots, \; D_n = q(n)
+$$
+3. given any subset of $k$ of these $D_i$ values, together with their identifying indices, the coefficients of $q(x)$ can be calculated by interpolation, and then $a_0 = q(0) = D$ is evaluated.
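+
+A minimal sketch of this construction over a prime field (the prime, the CSPRNG, and the function names are illustrative choices, not prescribed by the paper):
+
+```python
+import secrets
+
+P = 2**127 - 1                     # a Mersenne prime used as the field modulus
+
+def make_shares(secret, k, n):
+    coeffs = [secret] + [secrets.randbelow(P) for _ in range(k - 1)]
+    q = lambda x: sum(c * pow(x, i, P) for i, c in enumerate(coeffs)) % P
+    return [(x, q(x)) for x in range(1, n + 1)]            # shares D_i = q(i)
+
+def recover(shares):
+    # Lagrange interpolation at x = 0 recovers a_0 = D from any k shares
+    secret = 0
+    for xi, yi in shares:
+        num, den = 1, 1
+        for xj, _ in shares:
+            if xj != xi:
+                num = num * (-xj) % P
+                den = den * (xi - xj) % P
+        secret = (secret + yi * num * pow(den, -1, P)) % P
+    return secret
+
+shares = make_shares(123456789, k=3, n=5)
+assert recover(shares[:3]) == 123456789
+```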
+
+The efficiency of polynomial evaluation and interpolation
+> $O(n \log^2 n)$
+
+- To make the claim more precise, it can use **modular arithmetic** instead of real arithmetic.
+
+- If $D$ is long, it is advisable to break it into shorter blocks of bits (can be handled separately.)
+
+## 2. Strength (Contributions of the paper)
+1. When $k$ is kept fixed, $D_i$ pieces can be dynamically added or deleted *without affecting the other $D_i$ pieces*.
+
+2. It is easy to change the $D_i$ pieces without changing the original data $D$.
+
+## 3. Weakness (Limitations of the paper)
+1. The storage overhead (Storage space blow up) is very large.
+
+## 4. Future Works
diff --git a/StoragePaperNote/Security/Secret Sharing/SecretSharing-Summary.md b/StoragePaperNote/Security/Secret-Sharing/SecretSharing-Summary.md
old mode 100644
new mode 100755
similarity index 96%
rename from StoragePaperNote/Security/Secret Sharing/SecretSharing-Summary.md
rename to StoragePaperNote/Security/Secret-Sharing/SecretSharing-Summary.md
index aa23393..e624a61
--- a/StoragePaperNote/Security/Secret Sharing/SecretSharing-Summary.md
+++ b/StoragePaperNote/Security/Secret-Sharing/SecretSharing-Summary.md
@@ -1,34 +1,34 @@
----
-typora-copy-images-to: paper_figure
----
-Secret Sharing Algorithm Summary
-------------------------------------------
-
-[TOC]
-
-## Summary
-- Comparison of Secret Sharing Algorithms
-
- | Algorithm | Confidentiality Degree | Storage Blowup |
- | --------- | ---------------------- | -------------- |
- | SSSS | $r=k-1$ | $n$ |
- | IDA | $r=0$ | $\frac{n}{k}$ |
- | RSSS | $r \in [0, k-1]$ | $\frac{n}{k-r}$ |
- | SSMS | $r = k -1$ | $\frac{n}{k} + n \times \frac{S_{key}}{S_{sec}}$ |
- | AONT-RS | $r = k -1$ | $\frac{n}{k} + n \times \frac{S_{key}}{S_{sec}}$ |
-
-Formally, a secret sharing algorithm is based on three parameters $(n, k, r)$ where $n > k > r \geq 0$,
-> $n, k$ define the fault tolerance degree of a secret.
-> $r$ defines the confidentiality degree of a secret.
-
-Storage blowup: the ratio of of the total size of $n$ shares to the size of the original size of the original secret.
-
-> the storage blowup must be at least $\frac{n}{k}$.'
-
-## How to Share a Secret (SSSS)
-
-
-
-
-
-
+---
+typora-copy-images-to: paper_figure
+---
+Secret Sharing Algorithm Summary
+------------------------------------------
+
+[TOC]
+
+## Summary
+- Comparison of Secret Sharing Algorithms
+
+ | Algorithm | Confidentiality Degree | Storage Blowup |
+ | --------- | ---------------------- | -------------- |
+ | SSSS | $r=k-1$ | $n$ |
+ | IDA | $r=0$ | $\frac{n}{k}$ |
+ | RSSS | $r \in [0, k-1]$ | $\frac{n}{k-r}$ |
+ | SSMS | $r = k -1$ | $\frac{n}{k} + n \times \frac{S_{key}}{S_{sec}}$ |
+ | AONT-RS | $r = k -1$ | $\frac{n}{k} + n \times \frac{S_{key}}{S_{sec}}$ |
+
+Formally, a secret sharing algorithm is based on three parameters $(n, k, r)$ where $n > k > r \geq 0$,
+> $n, k$ define the fault tolerance degree of a secret.
+> $r$ defines the confidentiality degree of a secret.
+
+Storage blowup: the ratio of the total size of the $n$ shares to the size of the original secret.
+
+> the storage blowup must be at least $\frac{n}{k}$.
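+
+A tiny helper mirroring the blowup formulas in the table above (names and defaults are illustrative):
+
+```python
+def blowup(scheme, n, k, r=0, s_key=0.0, s_sec=1.0):
+    """Storage blowup for a secret of size s_sec with key size s_key."""
+    if scheme == "SSSS":
+        return float(n)
+    if scheme == "IDA":
+        return n / k
+    if scheme == "RSSS":
+        return n / (k - r)
+    if scheme in ("SSMS", "AONT-RS"):
+        return n / k + n * (s_key / s_sec)
+    raise ValueError(scheme)
+
+print(blowup("RSSS", n=6, k=4, r=2))   # 3.0
+```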
+
+## How to Share a Secret (SSSS)
+
+
+
+
+
+
diff --git a/StoragePaperNote/Security/ShareBigSecret-SYSTOR'18.md b/StoragePaperNote/Security/ShareBigSecret-SYSTOR'18.md
old mode 100644
new mode 100755
index 654f416..be863af
--- a/StoragePaperNote/Security/ShareBigSecret-SYSTOR'18.md
+++ b/StoragePaperNote/Security/ShareBigSecret-SYSTOR'18.md
@@ -1,104 +1,104 @@
----
-typora-copy-images-to: ../paper_figure
----
-How to Best Share a Big Secret
-------------------------------------------
-| Venue | Category |
-| :-----: | :------------------: |
-| SYSTOR'18 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-
-### Motivation of this paper
-- Motivation
- - Data-protection approaches are considered computationally expensive.
- - data encryption and secret-sharing
- - This paper intends to present the first end-to-end comparison of state-of-art encryption-based and secret sharing data protection approaches.
- - previous evaluation results do not provide a clear picture of how these schemes compare in terms of application-perceived read and write throughput.
- - the combination of a random number generator, erasure code, and encryption algorithm
- - determine which one becomes the bottleneck
-
-### How to best share a big secret
-
-
-
-
-
-- Encrypt files at the client side
- - Key-based encryption: (provide computational security)
- - encryption is considered computational expensive.
- - require key generation and management
- - hardware accelerators
-
-- Secret sharing: (cloud of clouds, provide information-theoretic security)
- - combines the user's original data with redundant random data
- - the original data can only be decoded by obtaining all of the encoded pieces.
- - Without requiring maintenance of encryption keys.
- - Drawback:
- - generate a large amount of random data.
- - incurs significant storage overhead
-
-- information-theoretic security vs. computational security
- - information-theoretic security $>$ computational security
-
-- To eliminate two major bottlenecks of data protection
- - Secure RAID: computational overhead is comparable to that of erasure coding.
- - Hardware cryptographic libraries.
-
-- Evaluate all stages of the data path:
- - Random data generation
- - true random: external noise or hardware interrupts: /dev/random
- - very slow
- - CSPRNG: whose seed must be generated by a true random generator: /dev/urandom
- - Encoding and encryption overheads
- - Overall throughput
-
-- Comparison
- - Reed-Solomon (RS): only provide fault tolerance.
- - Encryption: encrypt the data with a key-based symmetric cypher and encodes the result with RS.
- - AONT-RS: hash the encrypted data, combines the result with the encryption key, and encodes the entire package with RS.
- - Shamir's secret-sharingf scheme (SS): combine security and fault tolerance in non-systemic encoding
- - Secure RAID: combine security and fault tolerance in two encoding rounds based on RS.
-
-- Computational overhead
- - the low throughput of true random data generation precludes information-theoretical security in real system
- - the performance of AONT-RS is limited by the crytographic hash
- - secure RAID eliminates the computational bottleneck of secret sharing.
-
-- For End-To-End Evaluation
- - Aim: to understand the effect of the various **system-level** parameters on the bottlenecks.
- - storage and network bandwidth
- - Setting: multi-cloud (EC2), a distributed object store prototype. re-implement all schemes in Java.
- - Once storage and network bottlenecks are introduced, secret-sharing is outperformed by encryption based techniques
- - additional I/O and transfer overhead
- - bottleneck shift
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-1. This work mentions information-theoretical security is infeasible in real system implementations
-> high cost of true random data generation.
-
-2. The bottleneck in real implementations shifts from
-> 1. computational complexity
-> 2. storage throughput (on local storage)
-> 3. network bandwidth (in cloud deployments)
-
-3. This paper also suggests that encrypting the data and dispersing the keys with an efficient secret sharing scheme is optimal for multi-cloud environments
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-
-1. Security issue in cloud
-Majority of cloud service providers do not specify in their terms of service that data is owned by the customers, and lack security mechanisms to protect it.
-> data leakage from the cloud have been recently documented.
-
-2. drawback of the single cloud
-> vendor lock-in
-> outage that a single cloud provider might suffer
-
-3. Data encryption
-AES: fixed-length string: 128 bits
-> AES-128: key size is 128 bits
-> AES-256: key size is 256 bits
+---
+typora-copy-images-to: ../paper_figure
+---
+How to Best Share a Big Secret
+------------------------------------------
+| Venue | Category |
+| :-----: | :------------------: |
+| SYSTOR'18 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+
+### Motivation of this paper
+- Motivation
+ - Data-protection approaches are considered computationally expensive.
+ - data encryption and secret-sharing
+  - This paper intends to present the first end-to-end comparison of state-of-the-art encryption-based and secret-sharing data protection approaches.
+ - previous evaluation results do not provide a clear picture of how these schemes compare in terms of application-perceived read and write throughput.
+ - the combination of a random number generator, erasure code, and encryption algorithm
+ - determine which one becomes the bottleneck
+
+### How to best share a big secret
+
+
+
+
+
+- Encrypt files at the client side
+ - Key-based encryption: (provide computational security)
+    - encryption is considered computationally expensive.
+ - require key generation and management
+ - hardware accelerators
+
+- Secret sharing: (cloud of clouds, provide information-theoretic security)
+ - combines the user's original data with redundant random data
+ - the original data can only be decoded by obtaining all of the encoded pieces.
+ - Without requiring maintenance of encryption keys.
+ - Drawback:
+ - generate a large amount of random data.
+ - incurs significant storage overhead
+
+- information-theoretic security vs. computational security
+ - information-theoretic security $>$ computational security
+
+- To eliminate two major bottlenecks of data protection
+ - Secure RAID: computational overhead is comparable to that of erasure coding.
+ - Hardware cryptographic libraries.
+
+- Evaluate all stages of the data path:
+ - Random data generation
+ - true random: external noise or hardware interrupts: /dev/random
+ - very slow
+ - CSPRNG: whose seed must be generated by a true random generator: /dev/urandom
+ - Encoding and encryption overheads
+ - Overall throughput
+
+- Comparison
+  - Reed-Solomon (RS): only provides fault tolerance.
+  - Encryption: encrypts the data with a key-based symmetric cipher and encodes the result with RS.
+  - AONT-RS: hashes the encrypted data, combines the result with the encryption key, and encodes the entire package with RS.
+  - Shamir's secret-sharing scheme (SS): combines security and fault tolerance in non-systematic encoding
+  - Secure RAID: combines security and fault tolerance in two encoding rounds based on RS.
+
+- Computational overhead
+    - the low throughput of true random data generation precludes information-theoretic security in real systems
+    - the performance of AONT-RS is limited by the cryptographic hash
+ - secure RAID eliminates the computational bottleneck of secret sharing.
+
+- For End-To-End Evaluation
+ - Aim: to understand the effect of the various **system-level** parameters on the bottlenecks.
+ - storage and network bandwidth
+ - Setting: multi-cloud (EC2), a distributed object store prototype. re-implement all schemes in Java.
+    - Once storage and network bottlenecks are introduced, secret sharing is outperformed by encryption-based techniques
+ - additional I/O and transfer overhead
+ - bottleneck shift
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+1. This work mentions that information-theoretic security is infeasible in real system implementations
+> high cost of true random data generation.
+
+2. The bottleneck in real implementations shifts from
+> 1. computational complexity
+> 2. storage throughput (on local storage)
+> 3. network bandwidth (in cloud deployments)
+
+3. This paper also suggests that encrypting the data and dispersing the keys with an efficient secret sharing scheme is optimal for multi-cloud environments
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
+1. Security issue in cloud
+The majority of cloud service providers do not specify in their terms of service that data is owned by the customers, and they lack security mechanisms to protect it.
+> data leakages from the cloud have recently been documented.
+
+2. drawback of the single cloud
+> vendor lock-in
+> outages that a single cloud provider might suffer
+
+3. Data encryption
+AES: operates on fixed-length blocks of 128 bits
+> AES-128: key size is 128 bits
+> AES-256: key size is 256 bits
diff --git a/StoragePaperNote/Security/Splinter-NSDI'17.md b/StoragePaperNote/Security/Splinter-NSDI'17.md
old mode 100644
new mode 100755
index af1114e..4d1b15c
--- a/StoragePaperNote/Security/Splinter-NSDI'17.md
+++ b/StoragePaperNote/Security/Splinter-NSDI'17.md
@@ -1,137 +1,137 @@
----
-typora-copy-images-to: ../paper_figure
----
-Splinter: Practical Private Queries on Public Data
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| NSDI'17 | Function Secret Sharing |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-Many online services let users query large public datasets.
-> any user can query the data, and the datasets themselves are not sensitive.
-
-Malicious servers can infer the user privacy via the user query. (privacy leakage)
-
-- State-of-art limitations
-Previous private query systems have generally not achieved practical performance.
-> expensive cryptographic primitives and protocols.
-> high computational cost
-
-- Goal
-This paper intends to design a system that protects user's queries on public datasets while achieving **practical performance for many current web applications**.
-
-### Splinter
-- Main idea:
-The client splits each user query into shares and sends them to multiple providers.
-> It then combines their results to the final answer.
-> The user's query remains private as long as any one provider is honest.
-
-
-
-- Security goal
-1. hide sensitive parameters in a user's query.
-> query parameter and result
-
-2. does not protect against providers returning incorrect results or maliciously modifying the dataset.
-
-- Function Secret Sharing
-a client divides a function $f$ into function shares $f_1,f_2,......,f_k$ so that multiple parties can help evaluate $f$ without learning certain of its parameters.
-
-Practical FSS protocols only exists for **point** and **interval** functions.
-> Additivity of FSS: low communication costs.
-
-- Executing Splinter queries
- - Splinter client builds function shares.
- - Client sends the query with all the secret parameters removed to each provider, along with that provider's share of the condition function.
- - each provider divides its data into groups using grouping expressions.
- - the provider runs an evaluation protocol that depends on the aggregate function and on properties of the condition.
-
-Splinter query algorithm for aggerates depends on condition type.
-1. Condition Types and Classes
-> 1. single-value conditions: can only be true on one combination of the values of $(e_1, ..., e_t)$ (*Equality-only conditions*)
-> 2. interval conditions: condition is true on an interval of the range values ()
-> 3. Disjoint conditions: (*OR*)
-
-2. Aggregate Evaluation
-> 1. sum-based aggregates (*SUM* and *COUNT*)
-> 2. MAX and MIN
-> 3. TOPK
-
-
-In MAX MIN, and TOPK,
-
-**Complexity Summary**:
-
-
-
-In all cases, the computation time is $O(nlogn)$ and the communication costs are much smaller than the size of the database.
-
-
-- Optimized FSS Implementation
-Splinter includes an FSS implementation optimized for modern hardware.
-
-1. using one-way compression functions
-Splinter uses the Matyas-Meyer-Oseas one-way function
-> for fixed key cipher
-
-Splinter initializes the cipher at the beginning of the query and reuses it for the rest of the query
-> avoiding expensive AES initialization operations in the FSS protocol.
-
-2. select the correct multi-party FSS protocol
-Splinter proposes two different schemes
-> offer different trade-offs between bandwidth and CPU usage.
-
-**Multi-Party FSS with one-way functions**: the provider responds with all the records that match for a particular user-provided condition, and the client performs aggregation locally.
-> provide the fastest response times on low-latency networks
-> it is not suitable for the bandwidth-sensitive environments
-
-**Multi-Party FSS with Paillier**: has the same additive properties as the two-party FSS protocol.
-> the size of the function shares is independent of the number of parties.
-> performance is slower since using Paillier cryptosystem.
-> it is useful for SUM and COUNT queries in bandwidth-sensitive setting.
-
-
-### Implementation and Evaluation
-- Implementation
-Splinter in C++ using OpenSSL
-> GMP: large integers
-> OpenMP: for multithreadings
-
-1. FSS library: 2000 LoC
-2. top application: 2000 LoC
-3. test code: 1500 LoC
-
-- Evaluation
- - Case studies: restaurant review site, flight search, map routing
- - Comparison to other private query systems
- - FSS microbenmarks: Two-party FSS, Multi-party FSS.
- - Hosting cost: for the server-side computation cost in Amazon EC2.
-
-The overall conclusion are three-folds:
-> 1. it can support realistic applications
-> 2. it can achieve end-to-end latencies below 1.6 sec
-> 3. it can use up to 10 \* fewer round trips than prior systems.
-
-## 2. Strength (Contributions of the paper)
-
-1. First, this paper extends previous FSS protocols for point and interval functions with additive aggregates such MAX/MIN and TopK at low computational and communication cost.
-
-2. Second, it also takes care of the implementation with optimized AES-NI instructions and multicore CPUs.
-
-
-## 3. Weakness (Limitations of the paper)
-1. In Splinter architecture, each provider needs to host a copy of the data. This introduces the overhead of maintaining data consistency.
-
-2. Splinter cannot handle a large number of disjoint conditions, this is a place that can be considered to solve in the future.
-
-3. For the aspect of the economic feasibility, Splinter disables the provider to mine user data, which maybe hard to be accepted in current providers.
-
-4. Although this paper shows the Splinter can support many application in its evaluation part, it still can only support a subset of SQL. There still exists many unsupported queries, such as join operation.
-
-## 4. Some Insights (Future work)
-- From this paper, we can learn that instead of using only one provider, using multiple provider can hide the privacy of user queries.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Splinter: Practical Private Queries on Public Data
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| NSDI'17 | Function Secret Sharing |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+Many online services let users query large public datasets.
+> any user can query the data, and the datasets themselves are not sensitive.
+
+Malicious servers can infer private user information from the queries themselves. (privacy leakage)
+
+- State-of-the-art limitations
+Previous private query systems have generally not achieved practical performance.
+> expensive cryptographic primitives and protocols.
+> high computational cost
+
+- Goal
+This paper intends to design a system that protects users' queries on public datasets while achieving **practical performance for many current web applications**.
+
+### Splinter
+- Main idea:
+The client splits each user query into shares and sends them to multiple providers.
+> It then combines their results into the final answer.
+> The user's query remains private as long as any one provider is honest.
+
+
+
+- Security goal
+1. hide sensitive parameters in a user's query.
+> query parameters and results
+
+2. does not protect against providers returning incorrect results or maliciously modifying the dataset.
+
+- Function Secret Sharing
+A client divides a function $f$ into function shares $f_1, f_2, \ldots, f_k$ so that multiple parties can help evaluate $f$ without learning some of its parameters.
+
+Practical FSS protocols only exist for **point** and **interval** functions.
+> Additivity of FSS: low communication costs.
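+
+To make the additivity property concrete, here is a toy sketch (not Splinter's protocol): the point function for a secret value is additively secret-shared over the whole domain, each provider sums its share over all records, and the client adds the two answers to obtain a COUNT. Real FSS shares are succinct rather than full truth tables; `share_point_function`, `provider_count`, and the modulus `P` are illustrative names chosen for this sketch.
+
+```python
+import secrets
+
+P = 2**61 - 1  # toy prime modulus, not a value from the paper
+
+def share_point_function(secret_value, domain):
+    """Additively share f(x) = 1 if x == secret_value else 0.
+
+    Returns shares f1, f2 with (f1[x] + f2[x]) % P == f(x).
+    Unlike real FSS, each share is a full truth table, so this only
+    illustrates additivity, not the succinctness of FSS shares.
+    """
+    f1, f2 = {}, {}
+    for x in domain:
+        r = secrets.randbelow(P)
+        f1[x] = r
+        f2[x] = ((1 if x == secret_value else 0) - r) % P
+    return f1, f2
+
+def provider_count(share, records):
+    """Each provider locally sums its share evaluated on every record."""
+    return sum(share[rec] for rec in records) % P
+
+# COUNT(*) WHERE city == "NYC": neither provider alone learns "NYC".
+domain = ["NYC", "LA", "SF", "HK"]
+records = ["NYC", "LA", "NYC", "SF", "NYC"]
+f1, f2 = share_point_function("NYC", domain)
+print((provider_count(f1, records) + provider_count(f2, records)) % P)  # -> 3
+```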
+
+- Executing Splinter queries
+ - Splinter client builds function shares.
+ - Client sends the query with all the secret parameters removed to each provider, along with that provider's share of the condition function.
+ - Each provider divides its data into groups using grouping expressions.
+ - The provider runs an evaluation protocol that depends on the aggregate function and on properties of the condition.
+
+Splinter's query algorithm for aggregates depends on the condition type.
+1. Condition Types and Classes
+> 1. single-value conditions: can only be true on one combination of the values of $(e_1, ..., e_t)$ (*Equality-only conditions*)
+> 2. interval conditions: the condition is true on an interval of the range of values
+> 3. Disjoint conditions: (*OR*)
+
+2. Aggregate Evaluation
+> 1. sum-based aggregates (*SUM* and *COUNT*)
+> 2. MAX and MIN
+> 3. TOPK
+
+
+For MAX, MIN, and TOPK, the evaluation is more involved than for sum-based aggregates and depends on the class of the condition (single-value, interval, or disjoint).
+
+**Complexity Summary**:
+
+
+
+In all cases, the computation time is $O(n \log n)$ and the communication costs are much smaller than the size of the database.
+
+
+- Optimized FSS Implementation
+Splinter includes an FSS implementation optimized for modern hardware.
+
+1. using one-way compression functions
+Splinter uses the Matyas-Meyer-Oseas one-way function
+> built from a fixed-key cipher
+
+Splinter initializes the cipher at the beginning of the query and reuses it for the rest of the query
+> avoiding expensive AES initialization operations in the FSS protocol.
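+
+As a rough illustration of this fixed-key optimization (not Splinter's actual code), a Matyas-Meyer-Oseas-style one-way function can be built as $H(x) = E_k(x) \oplus x$, with the block cipher's key schedule initialized once and reused; the key name `FIXED_KEY`, the use of the `cryptography` package, and the single-block interface are assumptions of this sketch.
+
+```python
+# pip install cryptography
+import os
+from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
+
+FIXED_KEY = os.urandom(16)                                # chosen once, e.g. per query
+_cipher = Cipher(algorithms.AES(FIXED_KEY), modes.ECB())  # AES key schedule built once, reused below
+
+def one_way(block16: bytes) -> bytes:
+    """MMO-style one-way function on one 16-byte block: AES_k(x) XOR x."""
+    assert len(block16) == 16
+    enc = _cipher.encryptor()
+    ct = enc.update(block16) + enc.finalize()
+    return bytes(a ^ b for a, b in zip(ct, block16))
+```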
+
+2. selecting the correct multi-party FSS protocol
+Splinter proposes two different schemes
+> they offer different trade-offs between bandwidth and CPU usage.
+
+**Multi-Party FSS with one-way functions**: the provider responds with all the records that match for a particular user-provided condition, and the client performs aggregation locally.
+> provides the fastest response times on low-latency networks
+> it is not suitable for bandwidth-sensitive environments
+
+**Multi-Party FSS with Paillier**: has the same additive properties as the two-party FSS protocol.
+> the size of the function shares is independent of the number of parties.
+> performance is slower because it uses the Paillier cryptosystem.
+> it is useful for SUM and COUNT queries in bandwidth-sensitive settings.
+
+
+### Implementation and Evaluation
+- Implementation
+Splinter is implemented in C++ using OpenSSL
+> GMP: large integers
+> OpenMP: for multithreading
+
+1. FSS library: 2000 LoC
+2. top application: 2000 LoC
+3. test code: 1500 LoC
+
+- Evaluation
+ - Case studies: restaurant review site, flight search, map routing
+ - Comparison to other private query systems
+ - FSS microbenchmarks: Two-party FSS, Multi-party FSS.
+ - Hosting cost: for the server-side computation cost in Amazon EC2.
+
+The overall conclusions are three-fold:
+> 1. it can support realistic applications
+> 2. it can achieve end-to-end latencies below 1.6 sec
+> 3. it can use up to 10x fewer round trips than prior systems.
+
+## 2. Strength (Contributions of the paper)
+
+1. First, this paper extends previous FSS protocols for point and interval functions to support aggregates such as MAX/MIN and TOPK at low computational and communication cost.
+
+2. Second, it also provides a carefully optimized implementation that exploits AES-NI instructions and multicore CPUs.
+
+
+## 3. Weakness (Limitations of the paper)
+1. In the Splinter architecture, each provider needs to host a copy of the data, which introduces the overhead of maintaining data consistency.
+
+2. Splinter cannot efficiently handle a large number of disjoint conditions; this could be addressed in future work.
+
+3. Regarding economic feasibility, Splinter prevents providers from mining user data, which may be hard for current providers to accept.
+
+4. Although the evaluation shows that Splinter supports many applications, it still supports only a subset of SQL; many queries, such as joins, remain unsupported.
+
+## 4. Some Insights (Future work)
+- From this paper, we can learn that querying multiple providers, instead of only one, can keep user queries private.
+
diff --git a/StoragePaperNote/Sketch/Count-MinSketch.md b/StoragePaperNote/Sketch/Count-MinSketch.md
old mode 100644
new mode 100755
index 8238c80..2121e88
--- a/StoragePaperNote/Sketch/Count-MinSketch.md
+++ b/StoragePaperNote/Sketch/Count-MinSketch.md
@@ -1,62 +1,62 @@
----
-typora-copy-images-to: paper_figure
----
-An improved data stream summary: the count-min sketch and its applications
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------: |
-| Journal of Algorithms 2003 | sketch |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-In data stream scenario, algorithm for computing data stream context needs to satisfy the following requirement:
-1. the space used by the algorithm should be small, at most polylogarithmic in $n$.
-2. processing an update should be fast and simple, with a usable accuracy guarantees.
-
-This paper proposes a new sketch construction with
->1. space used is proportional to $\frac{1}{\varepsilon}$
->2. the update time is sublinear in the size of the sketch
->3. improve the space bounds of previous results from $\frac{1}{{\varepsilon}^2}$ to $\frac{1}{\varepsilon}$
->4. improve the time bounds from $\frac{1}{{\varepsilon}^2}$ to 1.
-
-### Count-Min Sketch
-Two parameters:
-> $\varepsilon$ and $\delta$: the error in answering a query is within a factor of $\varepsilon$ with probability $1 - \delta$
-
-The space and update time will consequently depend on $\varepsilon$ and $\delta$.
-
-It is named after the two basic operations used to answer point queries, counting first and computing the minimum next.
-
-- Three types:
->1. a point query
->2. a range query
->3. an inner product query
-
-- Data structure
-Two-dimensional array counts with width $w$ and depth $d$, set $w = \frac{e}{\varepsilon}$, and $d = ln(\frac{1}{\delta})$
-$d$ hash functions:
-$$
-h_1,..., h_d \space (\{1,...,n\} \rightarrow \{1,...,w\})
-$$
-
-- Update procedure
-An update $(i_t, c_t)$ arrives, i.e., item $a_{i_t}$ is updated by a quantity of $c_t$
-> 1. $c_t$ is added to one count in each row, and the counter is determined by hash function $h_j$.
-> 2. the array of $w \times d$ counts
-$$
-count[j, h_j(i_t)] \leftarrow count[j, h_j(i_t)] + c_t, \forall 1 \leq j \leq d
-$$
-
-
-- The analysis of point query
-
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+An improved data stream summary: the count-min sketch and its applications
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------: |
+| Journal of Algorithms 2003 | sketch |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+In the data stream scenario, algorithms for summarizing the stream need to satisfy the following requirements:
+1. the space used by the algorithm should be small, at most polylogarithmic in $n$.
+2. processing an update should be fast and simple, with usable accuracy guarantees.
+
+This paper proposes a new sketch construction with
+>1. space used is proportional to $\frac{1}{\varepsilon}$
+>2. the update time is sublinear in the size of the sketch
+>3. improve the space bounds of previous results from $\frac{1}{{\varepsilon}^2}$ to $\frac{1}{\varepsilon}$
+>4. improve the time bounds from $\frac{1}{{\varepsilon}^2}$ to 1.
+
+### Count-Min Sketch
+Two parameters:
+> $\varepsilon$ and $\delta$: the error in answering a query is within a factor of $\varepsilon$ with probability $1 - \delta$
+
+The space and update time will consequently depend on $\varepsilon$ and $\delta$.
+
+It is named after the two basic operations used to answer point queries, counting first and computing the minimum next.
+
+- Three types:
+>1. a point query
+>2. a range query
+>3. an inner product query
+
+- Data structure
+A two-dimensional array of counts with width $w$ and depth $d$, where $w = \lceil \frac{e}{\varepsilon} \rceil$ and $d = \lceil \ln(\frac{1}{\delta}) \rceil$.
+$d$ hash functions:
+$$
+h_1,..., h_d \space (\{1,...,n\} \rightarrow \{1,...,w\})
+$$
+
+- Update procedure
+An update $(i_t, c_t)$ arrives, i.e., item $a_{i_t}$ is updated by a quantity of $c_t$
+> 1. $c_t$ is added to one count in each row, and the counter is determined by hash function $h_j$.
+> 2. the array of $w \times d$ counts
+$$
+count[j, h_j(i_t)] \leftarrow count[j, h_j(i_t)] + c_t, \forall 1 \leq j \leq d
+$$
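+
+A minimal runnable sketch of the structure and its two operations follows; Python's salted built-in `hash` stands in for the paper's pairwise-independent hash family, and the class layout is illustrative rather than a reference implementation.
+
+```python
+import math
+import random
+
+class CountMinSketch:
+    def __init__(self, eps, delta, seed=1):
+        self.w = math.ceil(math.e / eps)            # width
+        self.d = math.ceil(math.log(1.0 / delta))   # depth = number of hash rows
+        self.counts = [[0] * self.w for _ in range(self.d)]
+        rng = random.Random(seed)
+        self.salts = [rng.getrandbits(64) for _ in range(self.d)]  # one hash per row
+
+    def _h(self, j, item):
+        return hash((self.salts[j], item)) % self.w
+
+    def update(self, item, c=1):
+        # count[j, h_j(i)] += c for every row j
+        for j in range(self.d):
+            self.counts[j][self._h(j, item)] += c
+
+    def point_query(self, item):
+        # estimate = min_j count[j, h_j(i)]; never underestimates for non-negative updates
+        return min(self.counts[j][self._h(j, item)] for j in range(self.d))
+
+cms = CountMinSketch(eps=0.01, delta=0.01)
+for x in ["a", "b", "a", "c", "a"]:
+    cms.update(x)
+print(cms.point_query("a"))  # 3 (possibly a slight overestimate)
+```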
+
+
+- The analysis of point query
+
+
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/Storage System/Ceph/CRUSH-SC'06.md b/StoragePaperNote/Storage System/Ceph/CRUSH-SC'06.md
old mode 100644
new mode 100755
index 2e7450f..17441fa
--- a/StoragePaperNote/Storage System/Ceph/CRUSH-SC'06.md
+++ b/StoragePaperNote/Storage System/Ceph/CRUSH-SC'06.md
@@ -1,42 +1,42 @@
----
-typora-copy-images-to: paper_figure
----
-# CRUSH: Controlled, Scalable, Decentralized Placement of Replicated Data
-@SC'06 @Ceph
-[TOC]
-
-## Summary
-***Motivation of this paper***: Most systems simply write new data to underutilized devices. This approach does not mixes old and new data together, leading to load imbalance. However, simply hash-based distribution fails to cope with changes in the number of devices, incurring **a massive reshuffling of data**. Thus this paper proposes a pseudo-random data distribution algorithm that efficiently and robustly distributes object replicas across **heterogeneous, structured storage cluster**.
-
-***CRUSH algorithm***:
-The two key goals of CRUSH are
->1. efficiency and scalability of the mapping algorithm.
->2. minimal data migration to restore a balanced distribution when the cluster changes due to the addition or removal of devices.
-
-- It distributes data objects among storage devices according to **a per-device weight value**, approximating a uniform robability distribution. Storage devices are assigned weights by the administrator to control the relative amount of data they are responsible for storing.
-- The data distribution policy is defined in terms of **placement rules** that specify how many replica targets are chosen from the cluster and what restrictions are imposed on replica placement.
-
-
-- **Bucket type**: Each bucket type is based on a different internal data structure and utilizes a different function $c(r, x)$ for pseudo-randomly choosing nested items during the replica placement process, representing a different **tradeoff** between computation and reorganization efficiency.
-- Map changes and data movement: The key idea of it is when an individual device fails, CRUSH flags the device but **leaves it in the hierarchy**. Such cluster map changes result in an optimal fraction, because only data on the failed device is moved and needs to be remapped to new storage targets.
-
-***Evaluation***:
-1. Data distribution: measure the distribution of objects across devices contained in a variety of bucket types and compared the variance in device utilization to the binomial probability distribution.
-2. Reorganization and Data Movement: Movement factor V.S OSDs added or removed, Movement factor V.S Original bucket size
-3. Algortithm: Time V.S Cluster size, Time V.S Bucket size
-## Strength (Contributions of the paper)
-1. This paper presents a pseudo-random data distribution algorithm with two key advantages:
->1) any party in a large system can **independently** calculate the location of any object.
->2) little metadata is required is mostly **static**, changing only when devices are added or removed
-
-2. Mapping calculations in CRUSH have $O(logn)$ running time, requiring only tens of microseconds to execute with thousands of devices.
-
-## Weakness (Limitations of the paper)
-1. Although the rule structure in this paper is currently enough to support the basic data distribution policies. How to design a more powerful rule structure?
-2. This algorithm highly relies on a suitably strong **multi-input integer hash function**. It can be a bottleneck of this algorithm.
-3. In the evaluation of reliability, it mentions it is difficult to quantify the magntitude of the improvement in overall system reliability in the absence of a specific **cluster configuration** and **associated historical failure data** to study.
-
-## Future Work
-1. For the first weakness, it can consider to design some more flexible rule structures
-2. For the second weakness, one can figure out faster hashing techniques
-
+---
+typora-copy-images-to: paper_figure
+---
+# CRUSH: Controlled, Scalable, Decentralized Placement of Replicated Data
+@SC'06 @Ceph
+[TOC]
+
+## Summary
+***Motivation of this paper***: Most systems simply write new data to underutilized devices. This approach does not mix old and new data together, leading to load imbalance. However, simple hash-based distribution fails to cope with changes in the number of devices, incurring **a massive reshuffling of data**. Thus this paper proposes a pseudo-random data distribution algorithm that efficiently and robustly distributes object replicas across **heterogeneous, structured storage clusters**.
+
+***CRUSH algorithm***:
+The two key goals of CRUSH are
+>1. efficiency and scalability of the mapping algorithm.
+>2. minimal data migration to restore a balanced distribution when the cluster changes due to the addition or removal of devices.
+
+- It distributes data objects among storage devices according to **a per-device weight value**, approximating a uniform probability distribution. Storage devices are assigned weights by the administrator to control the relative amount of data they are responsible for storing.
+- The data distribution policy is defined in terms of **placement rules** that specify how many replica targets are chosen from the cluster and what restrictions are imposed on replica placement.
+
+
+- **Bucket type**: Each bucket type is based on a different internal data structure and utilizes a different function $c(r, x)$ for pseudo-randomly choosing nested items during the replica placement process, representing a different **tradeoff** between computation and reorganization efficiency (a weighted-selection sketch follows this list).
+- Map changes and data movement: The key idea of it is when an individual device fails, CRUSH flags the device but **leaves it in the hierarchy**. Such cluster map changes result in an optimal fraction, because only data on the failed device is moved and needs to be remapped to new storage targets.
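+
+As a rough, hypothetical illustration of a weighted pseudo-random selection in the spirit of $c(r, x)$ (closest to a straw-style bucket): each item draws a hash-derived value scaled by its weight and the largest draw wins. The hash mixing and the rendezvous-style weighting are assumptions of this sketch, not CRUSH's actual computation.
+
+```python
+import hashlib
+
+def straw_select(items, x, r):
+    """Pick one item for object x at replica rank r: largest weight-scaled draw wins.
+
+    items: list of (name, weight). Real CRUSH uses its own integer hash and
+    rejects duplicate or failed devices with retries; both are omitted here.
+    """
+    best, best_draw = None, -1.0
+    for name, weight in items:
+        h = hashlib.sha256(f"{x}:{r}:{name}".encode()).digest()
+        u = int.from_bytes(h[:8], "big") / 2**64   # pseudo-random value in [0, 1)
+        draw = u ** (1.0 / weight)                 # selection probability proportional to weight
+        if draw > best_draw:
+            best, best_draw = name, draw
+    return best
+
+osds = [("osd.0", 1.0), ("osd.1", 1.0), ("osd.2", 2.0)]  # osd.2 should hold ~2x the data
+print([straw_select(osds, x=42, r=r) for r in range(3)])
+```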
+
+***Evaluation***:
+1. Data distribution: measure the distribution of objects across devices contained in a variety of bucket types and compared the variance in device utilization to the binomial probability distribution.
+2. Reorganization and Data Movement: Movement factor vs. OSDs added or removed, Movement factor vs. original bucket size
+3. Algorithm: Time vs. cluster size, Time vs. bucket size
+## Strength (Contributions of the paper)
+1. This paper presents a pseudo-random data distribution algorithm with two key advantages:
+>1) any party in a large system can **independently** calculate the location of any object.
+>2) little metadata is required, and it is mostly **static**, changing only when devices are added or removed
+
+2. Mapping calculations in CRUSH have $O(\log n)$ running time, requiring only tens of microseconds to execute with thousands of devices.
+
+## Weakness (Limitations of the paper)
+1. Although the rule structure in this paper is currently sufficient to support basic data distribution policies, how can a more powerful rule structure be designed?
+2. This algorithm highly relies on a suitably strong **multi-input integer hash function**. It can be a bottleneck of this algorithm.
+3. In the evaluation of reliability, it mentions that it is difficult to quantify the magnitude of the improvement in overall system reliability in the absence of a specific **cluster configuration** and **associated historical failure data** to study.
+
+## Future Work
+1. For the first weakness, one can consider designing more flexible rule structures
+2. For the second weakness, one can figure out faster hashing techniques
+
diff --git a/StoragePaperNote/Storage System/Ceph/Ceph-OSDI'06.md b/StoragePaperNote/Storage System/Ceph/Ceph-OSDI'06.md
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/Storage System/Ceph/RADOS-PDSW'07.md b/StoragePaperNote/Storage System/Ceph/RADOS-PDSW'07.md
old mode 100644
new mode 100755
index d8fe27e..19ead02
--- a/StoragePaperNote/Storage System/Ceph/RADOS-PDSW'07.md
+++ b/StoragePaperNote/Storage System/Ceph/RADOS-PDSW'07.md
@@ -1,38 +1,38 @@
----
-typora-copy-images-to: paper_figure
----
-# RADOS: A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters
-@PDSW'07 @Ceph
-[TOC]
-
-## Summary
-***Motivation of this paper***: In OSD, the storage systems largely fail to exploit device intelligence. Consistent management of data placement, failure detection, and failure recovery places an increasingly large burden on client, controller, or metadata directory nodes, **limiting scalability**. This paper wants to leverage the device intelligence to solve this issue by distributing the complexity.
-
-***Scalable Cluster Management***
-- Cluster Map: In RADOS system, the monitor cluster manages the storage cluster by using the **cluster map**. It includes two key information:
->1. which OSDs are included in the cluster
->2. distribution of all data in the system
-
-The cluster map is replicated by every storage node. Because cluster map changes may be frequent. Thus, its updates are distributed as **incremental maps**.
-
-- Data Placement: RADOS maps each object to a **placement group (PG)** by using the **CRUSH**, which can maintain a balanced distribution.
-
-***Intelligent Storage Devices and Monitors***
-Because the knowledge of the data distribution encapsulated in the cluster map, this feature allows OSDs to distribute management:
->1. Data redundancy: three replication scheme
->2. Failure detection
->3. Failure recovery: parallelize failure recovery
-
-- Monitors: a cluster of monitors are responsible for managing the storage system by storing the master copy of the cluster map and making periodic updates in response to configuration changes or changes in OSD state. Using **Paxos part-time parliament algortihm** and **lease mechanism** to favor consistency.
-
-
-## Strength (Contributions of the paper)
-This paper presents more details of scalable cluster management (cluster map), intelligenr storage devices (failure detection, failure recovery, replication), monitors (paxos service) in Ceph.
-
-## Weakness (Limitations of the paper)
-1. For the aspect of load balance, this paper does not mention how to solve the issue of many clients accessing a single popular object.
-2. In addition to $n-way$ replication, this paper does not mention how to support parity-based redundancy to improve the storage efficiently.
-
-## Future Work
-I think those two weaknesses can be regared as the research direction in the future.
-
+---
+typora-copy-images-to: paper_figure
+---
+# RADOS: A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters
+@PDSW'07 @Ceph
+[TOC]
+
+## Summary
+***Motivation of this paper***: Object-based storage systems (OSD-based systems) largely fail to exploit device intelligence. Consistent management of data placement, failure detection, and failure recovery places an increasingly large burden on client, controller, or metadata directory nodes, **limiting scalability**. This paper leverages device intelligence to address this issue by distributing the complexity.
+
+***Scalable Cluster Management***
+- Cluster Map: In the RADOS system, the monitor cluster manages the storage cluster by using the **cluster map**. It includes two key pieces of information:
+>1. which OSDs are included in the cluster
+>2. distribution of all data in the system
+
+The cluster map is replicated on every storage node. Because cluster map changes may be frequent, updates are distributed as **incremental maps**.
+
+- Data Placement: RADOS maps each object to a **placement group (PG)** by using the **CRUSH**, which can maintain a balanced distribution.
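+
+A minimal sketch of this two-level mapping, assuming a hash-to-PG step and a simple stand-in for CRUSH; `PG_NUM`, the function names, and the round-robin placement are illustrative, not RADOS's actual code.
+
+```python
+import hashlib
+
+PG_NUM = 128  # illustrative number of placement groups
+
+def object_to_pg(obj_name: str) -> int:
+    """First level: hash the object name into a placement group."""
+    h = int.from_bytes(hashlib.sha256(obj_name.encode()).digest()[:8], "big")
+    return h % PG_NUM
+
+def pg_to_osds(pg_id: int, osds, replicas=3):
+    """Second level: map a PG to OSDs. RADOS uses CRUSH here; this stand-in
+    just picks `replicas` distinct OSDs deterministically from the cluster map."""
+    start = pg_id % len(osds)
+    return [osds[(start + i) % len(osds)] for i in range(replicas)]
+
+osds = [f"osd.{i}" for i in range(10)]
+pg = object_to_pg("my_object")
+print(pg, pg_to_osds(pg, osds))
+```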
+
+***Intelligent Storage Devices and Monitors***
+Because knowledge of the data distribution is encapsulated in the cluster map, OSDs can distribute management among themselves:
+>1. Data redundancy: three replication schemes (primary-copy, chain, and splay)
+>2. Failure detection
+>3. Failure recovery: parallelize failure recovery
+
+- Monitors: a cluster of monitors is responsible for managing the storage system by storing the master copy of the cluster map and making periodic updates in response to configuration changes or changes in OSD state. It uses the **Paxos part-time parliament algorithm** and a **lease mechanism** to favor consistency.
+
+
+## Strength (Contributions of the paper)
+This paper presents more details of scalable cluster management (cluster map), intelligent storage devices (failure detection, failure recovery, replication), and monitors (Paxos service) in Ceph.
+
+## Weakness (Limitations of the paper)
+1. For the aspect of load balance, this paper does not mention how to solve the issue of many clients accessing a single popular object.
+2. In addition to $n$-way replication, this paper does not mention how to support parity-based redundancy to improve storage efficiency.
+
+## Future Work
+I think those two weaknesses can be regarded as future research directions.
+
diff --git a/StoragePaperNote/Storage System/Cumulus-FAST'09.md b/StoragePaperNote/Storage System/Cumulus-FAST'09.md
old mode 100644
new mode 100755
index d759a28..abc4308
--- a/StoragePaperNote/Storage System/Cumulus-FAST'09.md
+++ b/StoragePaperNote/Storage System/Cumulus-FAST'09.md
@@ -1,134 +1,134 @@
----
-typora-copy-images-to: ../paper_figure
----
-Cumulus: Filesystem Backup to the Cloud
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'09 | Backup system |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-Most existing cloud-based backup services implement integrated solutions that include backup-up specific software hosted on both **the client and at the data center**.
-> this approach allows greater storage and bandwidth efficiency, but reduce the portability (locking customers into a particular provider)
-
-This paper intends to explore the **thin cloud** with a **minimal interface** (put, get, delete, list)
-> portable for any on-line storage services
-
-- Main goal
-Cumulus tries to be network-friendly like rsync-basd tools, while using only a generic storage interface.
-
-### Design
-- Storage server interface
-1. **Get**: input: the pathname, retrieve the contents of a file from the server
-2. **Put**: input: the pathname, store a complete file on the server
-3. **List**: get the names of files stored on the server
-4. **Delete**: remove the given file from the server
-
-All of these operations operate on entire files.
-> the interface is simple enough that it can be implemented on top of any number of protocols.
-
-- Write-one storage model
-the only way to modify a file in this narrow interface is to upload it again **in full**
-> a file is never modified after it is first stored, except to delete it to recover space.
-> do not modify files **in place**.
-
-- Storage segments
-1. Filesystem always contains many small files.
-2. When storing a snapshot, Cumulus often groups data from many smaller files together into large units. (segments)
-3. Amazon S3 encourages using files greater than $100KB$ in size.
-
-Why?
-Provide additional privacy when encryption is used:
-Aggregation hide the size as well as contents of individual files.
-
-- Snapshot format
-Snapshots logically consist of two parties:
-> 1. metadata log: lists all files backed up
-> 2. the data itself.
-
-Both two parties are broken apart into blocks, and these objects are then packed together into segments.
-
-**Snapshot descriptors**: the one piece of data in each snapshot not stored in a segment
-
-> include a timestamp and a pointer to the root object.
-
-
-Starting with the root object stored in the snapshot descriptor and traversing all pointers found, a list of all segments required by the snapshot can be constructed.
-
-
-- Segment cleaning (GC)
-1. in-place cleaning: identifies segments with wasted space and rewrites the segments to keep just the needed data.
-> **Cumulus**: does not clean in place.
-
-2. The simplest policy: set a minimum segment utilization threshold.
-utilization: the fraction of bytes within the segment which are referenced by a current snapshot.
-
-- Restore
- - Complete restore: extracts all files as they were on a given date
- - Partial restore: recovers one or a small number of files.
-
-Cumulus is primarily optimized for the **complete restore**.
-
-
-
-### Implementation and Evaluation
-- Implementation
-Source code
-> 3200 LoC c++ for core backup functionality.
-> 1000 LoC python for restore, segment cleaning, and statistics gathering.
-
-1. Local client state
-each client stores on its **local disk** information about recent backups.
-> storing it locally reduces network bandwidth and improves access times.
-> two parties: 1) a local copy of the metadata log, 2) SQLite database containing all other needed information,
-
-Cumulus uses the local copy of the previous metadata log to quickly detect and skip over unchanged files based on **modification time**.
-
-Database: keeps a record of recent snapshots and all segments and objects stored in them.
-> an index by content hash to support data deduplication.
-
-2. Segment cleaning
-Using Python controls cleaning.
-> when writing a snapshot, Cumulus records in the local database a summary of all segments used by the snapshot and the fraction of the data in each segment that is actually referenced.
-
-3. Segment filtering and storage
-Cumulus backup implementation is only capable of writing segments as **uncompressed TAR** files to local disk.
-> additional functionality is implemented by calling out to **external scripts**.
-> using **gzip** to provide compression
-> using **boto** library to interact with AWS
-
-- Evaluation
-1. Trace workload: fileserver, user. (private dataset)
-2. Simulation and prototype evaluation
-
-**Prototype evaluation**
-1. compare with Jungle Disk and Brackup.
-
-3. sub-file incrementals
-> comparison with **rdiff** (rsync-style)
-
-4. upload time
-> using **boto** Python library to interface with Amazon S3.
-
-5. restore time
-
-
-
-## 2. Strength (Contributions of the paper)
-
-1. provide both simulation and experimental results of Cumulus performance, overhead and cost in trace-driven scenarios.
-
-2. Cumulus can leave the backup itself almost entirely opaque to the server.
-> make Cumulus portable to nearly any type of storage server.
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-
-1. It also mentions Cumulus does not design deduplication between backups from different clients.
-> Since server-side deduplication is vulnerable to dictionary attacks to determine what data clients are storing, and storage accounting for billing purpose is more difficult.
-
+---
+typora-copy-images-to: ../paper_figure
+---
+Cumulus: Filesystem Backup to the Cloud
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'09 | Backup system |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+Most existing cloud-based backup services implement integrated solutions that include backup-specific software hosted on both **the client and the data center**.
+> this approach allows greater storage and bandwidth efficiency, but reduces portability (locking customers into a particular provider)
+
+This paper intends to explore the **thin cloud** with a **minimal interface** (put, get, delete, list)
+> portable for any on-line storage services
+
+- Main goal
+Cumulus tries to be network-friendly like rsync-based tools, while using only a generic storage interface.
+
+### Design
+- Storage server interface
+1. **Get**: input: the pathname, retrieve the contents of a file from the server
+2. **Put**: input: the pathname, store a complete file on the server
+3. **List**: get the names of files stored on the server
+4. **Delete**: remove the given file from the server
+
+All of these operations operate on entire files.
+> the interface is simple enough that it can be implemented on top of any number of protocols.
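+
+As an illustration of how narrow this interface is, here is a minimal sketch of a backend exposing only these four whole-file operations, using a local directory as an assumed stand-in for an online storage service (the class layout and flat naming are illustrative, not Cumulus's code).
+
+```python
+import os
+
+class LocalStore:
+    """Toy backend exposing the four whole-file operations (flat names only)."""
+
+    def __init__(self, root):
+        self.root = root
+        os.makedirs(root, exist_ok=True)
+
+    def put(self, pathname: str, data: bytes) -> None:
+        with open(os.path.join(self.root, pathname), "wb") as f:
+            f.write(data)                      # always stores the complete file
+
+    def get(self, pathname: str) -> bytes:
+        with open(os.path.join(self.root, pathname), "rb") as f:
+            return f.read()
+
+    def list(self):
+        return os.listdir(self.root)
+
+    def delete(self, pathname: str) -> None:
+        os.remove(os.path.join(self.root, pathname))
+```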
+
+- Write-once storage model
+The only way to modify a file in this narrow interface is to upload it again **in full**.
+> a file is never modified after it is first stored, except to delete it to recover space.
+> do not modify files **in place**.
+
+- Storage segments
+1. Filesystem always contains many small files.
+2. When storing a snapshot, Cumulus often groups data from many smaller files together into large units. (segments)
+3. Amazon S3 encourages using files greater than $100KB$ in size.
+
+Why?
+Segments provide additional privacy when encryption is used:
+aggregation hides the sizes as well as the contents of individual files.
+
+- Snapshot format
+Snapshots logically consist of two parts:
+> 1. metadata log: lists all files backed up
+> 2. the data itself.
+
+Both parts are broken apart into blocks (objects), and these objects are then packed together into segments.
+
+**Snapshot descriptors**: the one piece of data in each snapshot not stored in a segment
+
+> include a timestamp and a pointer to the root object.
+
+
+Starting with the root object stored in the snapshot descriptor and traversing all pointers found, a list of all segments required by the snapshot can be constructed.
+
+
+- Segment cleaning (GC)
+1. in-place cleaning: identifies segments with wasted space and rewrites the segments to keep just the needed data.
+> **Cumulus**: does not clean in place.
+
+2. The simplest policy: set a minimum segment utilization threshold.
+utilization: the fraction of bytes within the segment which are referenced by a current snapshot.
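+
+A minimal sketch of this threshold policy, assuming per-segment size and referenced-byte statistics are available (as the local database described later records); the function name and the 0.6 threshold are illustrative, not values from the paper.
+
+```python
+def segments_to_clean(segment_sizes, referenced_bytes, threshold=0.6):
+    """Return segment ids whose utilization falls below the threshold.
+
+    segment_sizes: {segment_id: total bytes in the segment}
+    referenced_bytes: {segment_id: bytes still referenced by current snapshots}
+    """
+    victims = []
+    for seg, total in segment_sizes.items():
+        utilization = referenced_bytes.get(seg, 0) / total
+        if utilization < threshold:
+            victims.append(seg)  # live data is later copied into new segments,
+                                 # since Cumulus never rewrites a segment in place
+    return victims
+
+print(segments_to_clean({"s1": 100, "s2": 100}, {"s1": 90, "s2": 20}))  # ['s2']
+```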
+
+- Restore
+ - Complete restore: extracts all files as they were on a given date
+ - Partial restore: recovers one or a small number of files.
+
+Cumulus is primarily optimized for the **complete restore**.
+
+
+
+### Implementation and Evaluation
+- Implementation
+Source code
+> 3200 LoC c++ for core backup functionality.
+> 1000 LoC python for restore, segment cleaning, and statistics gathering.
+
+1. Local client state
+each client stores on its **local disk** information about recent backups.
+> storing it locally reduces network bandwidth and improves access times.
+> two parts: 1) a local copy of the metadata log, and 2) an SQLite database containing all other needed information.
+
+Cumulus uses the local copy of the previous metadata log to quickly detect and skip over unchanged files based on **modification time**.
+
+Database: keeps a record of recent snapshots and all segments and objects stored in them.
+> an index by content hash to support data deduplication.
+
+2. Segment cleaning
+Cleaning is controlled by Python code.
+> when writing a snapshot, Cumulus records in the local database a summary of all segments used by the snapshot and the fraction of the data in each segment that is actually referenced.
+
+3. Segment filtering and storage
+The Cumulus backup implementation is only capable of writing segments as **uncompressed TAR** files to local disk.
+> additional functionality is implemented by calling out to **external scripts**.
+> using **gzip** to provide compression
+> using **boto** library to interact with AWS
+
+- Evaluation
+1. Trace workload: fileserver, user. (private dataset)
+2. Simulation and prototype evaluation
+
+**Prototype evaluation**
+1. compare with Jungle Disk and Brackup.
+
+3. sub-file incrementals
+> comparison with **rdiff** (rsync-style)
+
+4. upload time
+> using **boto** Python library to interface with Amazon S3.
+
+5. restore time
+
+
+
+## 2. Strength (Contributions of the paper)
+
+1. It provides both simulation and experimental results on Cumulus's performance, overhead, and cost in trace-driven scenarios.
+
+2. Cumulus can leave the backup itself almost entirely opaque to the server.
+> this makes Cumulus portable to nearly any type of storage server.
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
+1. The paper also mentions that Cumulus does not support deduplication between backups from different clients.
+> Server-side deduplication is vulnerable to dictionary attacks that determine what data clients are storing, and it makes storage accounting for billing purposes more difficult.
+
2. Cumulus also detects changes to files in the traces using a **per-file hash**.
\ No newline at end of file
diff --git a/StoragePaperNote/Storage System/HDFS-MSST'10.md b/StoragePaperNote/Storage System/HDFS-MSST'10.md
old mode 100644
new mode 100755
index cb00e02..87fd45b
--- a/StoragePaperNote/Storage System/HDFS-MSST'10.md
+++ b/StoragePaperNote/Storage System/HDFS-MSST'10.md
@@ -1,101 +1,101 @@
----
-typora-copy-images-to: paper_figure
----
-# The Hadoop Distributed File System
-@MSST'10 @Introduction
-[TOC]
-
-## Summary
-***Motivation of this paper***:
-
-This paper describes the architecture of HDFS and reports on experience of using HDFS to manage 25 petabytes of enterprise data at Yahpp.`
-
-## HDFS
-- HDFS stores file system metadata and application data separately.
->1. NameNode: store metadata on a dedicated server
->2. DataNode: store application data on other servers
-
-All servers are fully connected and communicate with each other using TCP-based protocols.
-
-### Architecture
-- NameNode
-The HDFS namespace is a hierarchy of files and directories.
-> Files and directories are represented on the NameNode by **inodes** (e.g., permissions, modification and access times, namespace and disk quotas)
-
-The NameNode maintains the namespace tree and the mapping of file blocks to DataNode. A HDFS client firstly contacts the NameNode for locations of data blocks comprising the file and then reads block contents from the DataNode closest to the client.
-
-- DataNodes
-Each block replica on a DataNode is represented by two files in the local host's native file system.
->1. the data itself
->2. block's metadata including the checksums for the block data and the block's generation stamp
-
-During the startup, each DataNode connects to the NameNode and preforms a **handshake**.
-> Purpose: verify the namespace ID and the software version
-
-If either does not match that of the NameNode the DataNode automatically shuts down. After the handshake, DataNode registers with the NameNode.
-> storage ID is assigned to the DataNode when it registers with the NameNode for the first time and never chanegs after that.
-
-- Block Report
-DataNode sends to NameNode the block reports including the *block id*, the *generation stamp*, and the *length* for each block replica the server hosts.
-
-- Heartbeat
-From the a DataNode, it also carries information about total storage capacity, fraction of storage in use, and the number of data transfers currently in progress.
-
-NameNode uses replies to heartbeats to send instructions to the DataNodes.
-
-
-### HDFS Client
-HDFS client is a code library that exports the HDFS file system interface. (e.g., read, write, delete files, and operations to create and delete directories)
-
-
-When a client opens a file to read to , it fetches the list of blocks and the locations of each block replica from the NameNode.
-
-## File I/O Operations and Replica Management
-HDFS implements a single-writer, multiple-reader model.
-> The HDFS client that opens a file for writing is granted a lease for the file, no other client can write to file.
-
-### Data Pipeline
-
-The DataNodes form a pipeline, the order of which minimizes the total network distance from the client to the last DataNode.
-
-Bold lines represent data packets, dashed lines represent acknowledge messages, thin lines represent control mesages to setup and close the pipeline.
-
-### Block Placement
-When a DataNode registers with the NameNode, the NameNode runs a configured script to decide which rack the node belongs to.
-
-- The default HDFS block placement policy provides a tradeoff between minimizing the write cost, and maximizing data reliability, availability, and aggregate read bandwidth
->1. places the first replica on the node where the writer is located
->2. places the second and third replicas on two different racks.
-
-This policy reduces the inter-rack and inter-node write traffic and generally improves write performance.
-
-
-## Practice at Yahoo
-In this part, this paper demonstrates the configuration and data in Yahoo.
-
-For benchmark, it uses DFSIO benchmark to measure average throughput for read, write and append operations.
-> DFSIO is an application available as part of the Hadoop distribution. It is designed to measure performance only during data transfer, and excludes the overheads of task scheduling startup, and the reduce task.
-
-The **NNThroughput** benchmark:
-> it is a single node process which starts the NameNode application and runs a series of client threads on the same node.
-
-
-## Futhre Work
-1. Single Failure: The Hadoop cluster is effectively unavailable when its NameNode is down. It has taken steps towards automated failover.
-> Using BackupNode
-
-2. Scalability of the NameNode: it has been a key struggle. The main challenge with the NameNode has been that when its memory usage is close to the maximum tha NameNode becomes unrespomsive due to Java garbage collection and sometimes requires a restart.
-> their near-term solution to scalability is to allow **multiple** namespaces (and NameNodes) to share the physical storage within a cluster.
-> the main drawback of multiple independent namespaces is the cost of managing them, especially, if the number of the namespaces is large.
-
-
-
-
-
-
-
-
-
-
-
-
+---
+typora-copy-images-to: paper_figure
+---
+# The Hadoop Distributed File System
+@MSST'10 @Introduction
+[TOC]
+
+## Summary
+***Motivation of this paper***:
+
+This paper describes the architecture of HDFS and reports on the experience of using HDFS to manage 25 petabytes of enterprise data at Yahoo.
+
+## HDFS
+- HDFS stores file system metadata and application data separately.
+>1. NameNode: store metadata on a dedicated server
+>2. DataNode: store application data on other servers
+
+All servers are fully connected and communicate with each other using TCP-based protocols.
+
+### Architecture
+- NameNode
+The HDFS namespace is a hierarchy of files and directories.
+> Files and directories are represented on the NameNode by **inodes** (e.g., permissions, modification and access times, namespace and disk quotas)
+
+The NameNode maintains the namespace tree and the mapping of file blocks to DataNodes. An HDFS client first contacts the NameNode for the locations of the data blocks comprising a file and then reads block contents from the DataNode closest to the client.
+
+- DataNodes
+Each block replica on a DataNode is represented by two files in the local host's native file system.
+>1. the data itself
+>2. block's metadata including the checksums for the block data and the block's generation stamp
+
+During startup, each DataNode connects to the NameNode and performs a **handshake**.
+> Purpose: verify the namespace ID and the software version
+
+If either does not match that of the NameNode, the DataNode automatically shuts down. After the handshake, the DataNode registers with the NameNode.
+> a storage ID is assigned to the DataNode when it registers with the NameNode for the first time and never changes after that.
+
+- Block Report
+A DataNode sends block reports to the NameNode, including the *block id*, the *generation stamp*, and the *length* of each block replica the server hosts.
+
+- Heartbeat
+A heartbeat from a DataNode also carries information about total storage capacity, the fraction of storage in use, and the number of data transfers currently in progress.
+
+NameNode uses replies to heartbeats to send instructions to the DataNodes.
+
+
+### HDFS Client
+HDFS client is a code library that exports the HDFS file system interface. (e.g., read, write, delete files, and operations to create and delete directories)
+
+
+When a client opens a file to read, it fetches the list of blocks and the locations of each block replica from the NameNode.
+
+## File I/O Operations and Replica Management
+HDFS implements a single-writer, multiple-reader model.
+> The HDFS client that opens a file for writing is granted a lease for the file; no other client can write to the file.
+
+### Data Pipeline
+
+The DataNodes form a pipeline, the order of which minimizes the total network distance from the client to the last DataNode.
+
+Bold lines represent data packets, dashed lines represent acknowledgment messages, and thin lines represent control messages to set up and close the pipeline.
+
+### Block Placement
+When a DataNode registers with the NameNode, the NameNode runs a configured script to decide which rack the node belongs to.
+
+- The default HDFS block placement policy provides a tradeoff between minimizing the write cost and maximizing data reliability, availability, and aggregate read bandwidth:
+>1. places the first replica on the node where the writer is located
+>2. places the second and third replicas on two different nodes in a single different rack.
+
+This policy reduces the inter-rack and inter-node write traffic and generally improves write performance.
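+
+A minimal sketch of this default policy under the corrected statement above; the topology representation and node/rack names are illustrative, and further replicas, rack-size corner cases, and pipeline ordering are omitted.
+
+```python
+import random
+
+def place_replicas(writer_node, topology):
+    """Replica 1 on the writer's node; replicas 2 and 3 on two different
+    nodes of one remote rack. topology: {rack_name: [node, ...]}."""
+    writer_rack = next(r for r, nodes in topology.items() if writer_node in nodes)
+    remote_rack = random.choice([r for r in topology if r != writer_rack])
+    second, third = random.sample(topology[remote_rack], 2)
+    return [writer_node, second, third]
+
+topology = {"rack1": ["n1", "n2"], "rack2": ["n3", "n4"], "rack3": ["n5", "n6"]}
+print(place_replicas("n1", topology))
+```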
+
+
+## Practice at Yahoo
+In this part, the paper describes the HDFS configuration and data at Yahoo.
+
+For benchmarking, it uses the DFSIO benchmark to measure average throughput for read, write, and append operations.
+> DFSIO is an application available as part of the Hadoop distribution. It is designed to measure performance only during data transfer, and excludes the overheads of task scheduling startup, and the reduce task.
+
+The **NNThroughput** benchmark:
+> it is a single node process which starts the NameNode application and runs a series of client threads on the same node.
+
+
+## Future Work
+1. Single point of failure: the Hadoop cluster is effectively unavailable when its NameNode is down. Steps have been taken towards automated failover.
+> Using BackupNode
+
+2. Scalability of the NameNode has been a key struggle. The main challenge is that when its memory usage is close to the maximum, the NameNode becomes unresponsive due to Java garbage collection and sometimes requires a restart.
+> their near-term solution to scalability is to allow **multiple** namespaces (and NameNodes) to share the physical storage within a cluster.
+> the main drawback of multiple independent namespaces is the cost of managing them, especially, if the number of the namespaces is large.
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/StoragePaperNote/Storage System/Multi-Cloud/Kurma-SYSTOR'19.md b/StoragePaperNote/Storage System/Multi-Cloud/Kurma-SYSTOR'19.md
old mode 100644
new mode 100755
index 7371931..7a30f4c
--- a/StoragePaperNote/Storage System/Multi-Cloud/Kurma-SYSTOR'19.md
+++ b/StoragePaperNote/Storage System/Multi-Cloud/Kurma-SYSTOR'19.md
@@ -1,97 +1,97 @@
----
-typora-copy-images-to: ../paper_figure
----
-Kurma: Secure Geo-Distributed Multi-Cloud Storage Gateways
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| SYSTOR'19 | Secure Multi-Cloud Storage|
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-Many cannot store data in cloud due the security conerns and legacy infrastructure such as **network-attached** storage (NAS). This paper wants to design a multi-cloud storage gateway system to allow NAS-based programs to seamlessly and securely access cloud storage.
-### Kurma
-- Threat Model
-Kurma considers the on-premises gateways trusted, and the public cloud untrusted.
-> Transferring data to clouds is vulnerable to man-in-the-middle attacks.
-> Eventually-consistent clouds may return stale data.
-
-- Design Goal:
-1. Strong security
-2. High availability: no single point failure
-3. High performance: overcome high latency of remote cloud storage.
-4. High flexibility: support trade-off among security, availability, performance, and cost.
-> replication, erasure coding and secret sharing
-> trade-oofs among availability, performance, and costs.
-
-
-
-
-- Architecture
-1. File system metadata: each Kurma gateway runs a separate ZooKeeper instance that stores a **full replica** of the whole file-system metadata.
-2. use a persistent write-back cache to avoid the long latency of a cloud accesses.
-
-
-
-
-- Metadata Management
-Kurma gateway maintains a replica of the entire file-system metadata in a local ZooKeeper instance.
-> metadata operations can be processed without synchronizing with other gateways.
-> store the metadata in-memory to achieve fast operations.
-
-Minimizing the memory footprint using three stategies:
-1. use a large block size, reduce the amount of blocks, and improve the throughput.
-2. compresses its metadata, via data locality.
-3. only store version number and GatewayID in gateway, and other metadata in cloud.
-
-
-- Security
-Kurma stores each encrypted block as a key-value object:
-> key is derived from the block's metadata.
-> ensure that each version of a block hash a unique cloud object key
-> attacker cannot tell whether two objects are two versions of the same block.
-
-
-
-
-
-- Multi-Cloud
-1. Replication
-2. Erasure Coding
-3. Secret Sharing
-
-- File Sharing Across Gateways
-close-to-open consistency: when a client opens a file, the client sees all changes made by other clients in the same region who closed the file before the open.
-
-- Persistent Caching
-Kurma NFS server has a persistent cache so that hot data can be read in the low-latency on-premises network instead of from remote clouds.
-
-
-### Implementation and Evaluation
-- Implementation:
-Kurma NFS Server: 15800 lines C++
-Gateway Server: 22700 lines java
-**Optimization**:
-> 1. using a separate thread to pre-compute a pool of *keymap*s (RSA keymap), so that Kurma can quickly take one *keymap* out of the pool when creating a file.
-> 2. batch multiple ZooKeeper changes into a single ZooKeeper transcation.
-> 3. Latencies of clouds vary significantly over time: Kurma sorts cloud providers in every $N$ seconds, and uses the $K$ fastest clouds as backends.
-
-- Evaluation
-Compare with a baseline using a single-node NFS server.
-> Metadata operation, data operations.
-
-## 2. Strength (Contributions of the paper)
-1. This paper proposes a secure geo-distributed multi-cloud storage gateway system, which keeps file-system metadata in trusted gateways, and stores the encryted data block in back-end clouds.
-> a geo-distributed file system.
-
-2. To solve the issue of metadata freshness, Kurma embeds a version number and a timestamp into each file block to ensure data freshness.
-
-
-## 3. Weakness (Limitations of the paper)
-1. Network paritions may disrupt the replication of version numbers among gateways. This can make Kurma cannot guarantee that a client always see all updates made by clients in other regios.
-
-
-## 4. Future Works
-1. this paper shows a new architecture of network storage system, the key is let a trusted gateway to storage the metadata.
+---
+typora-copy-images-to: ../paper_figure
+---
+Kurma: Secure Geo-Distributed Multi-Cloud Storage Gateways
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| SYSTOR'19 | Secure Multi-Cloud Storage|
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+Many organizations cannot store data in the cloud due to security concerns and legacy infrastructure such as **network-attached storage** (NAS). This paper designs a multi-cloud storage gateway system that allows NAS-based programs to seamlessly and securely access cloud storage.
+### Kurma
+- Threat Model
+Kurma considers the on-premises gateways trusted, and the public cloud untrusted.
+> Transferring data to clouds is vulnerable to man-in-the-middle attacks.
+> Eventually-consistent clouds may return stale data.
+
+- Design Goal:
+1. Strong security
+2. High availability: no single point of failure
+3. High performance: overcome high latency of remote cloud storage.
+4. High flexibility: support trade-off among security, availability, performance, and cost.
+> replication, erasure coding and secret sharing
+> trade-offs among availability, performance, and costs.
+
+
+
+
+- Architecture
+1. File system metadata: each Kurma gateway runs a separate ZooKeeper instance that stores a **full replica** of the whole file-system metadata.
+2. use a persistent write-back cache to avoid the long latency of cloud accesses.
+
+
+
+
+- Metadata Management
+Kurma gateway maintains a replica of the entire file-system metadata in a local ZooKeeper instance.
+> metadata operations can be processed without synchronizing with other gateways.
+> store the metadata in-memory to achieve fast operations.
+
+Kurma minimizes the memory footprint using three strategies:
+1. use a large block size to reduce the number of blocks and improve throughput.
+2. compress its metadata by exploiting data locality.
+3. store only the version number and GatewayID in the gateway, and the other metadata in the cloud.
+
+
+- Security
+Kurma stores each encrypted block as a key-value object:
+> key is derived from the block's metadata.
+> this ensures that each version of a block has a unique cloud object key
+> an attacker cannot tell whether two objects are two versions of the same block.
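+
+A hypothetical sketch of deriving a per-version cloud object key from block metadata; HMAC-SHA-256, the field names, and the secret handling are assumptions of this sketch, not Kurma's exact construction.
+
+```python
+import hashlib
+import hmac
+
+def cloud_object_key(secret: bytes, file_id: int, block_index: int,
+                     version: int, gateway_id: str) -> str:
+    """Derive a distinct, random-looking cloud object key for each block version."""
+    msg = f"{file_id}:{block_index}:{version}:{gateway_id}".encode()
+    return hmac.new(secret, msg, hashlib.sha256).hexdigest()
+
+secret = b"gateway-held secret"
+print(cloud_object_key(secret, file_id=17, block_index=3, version=2, gateway_id="gw-east"))
+```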
+
+
+
+
+
+- Multi-Cloud
+1. Replication
+2. Erasure Coding
+3. Secret Sharing
+
+- File Sharing Across Gateways
+close-to-open consistency: when a client opens a file, the client sees all changes made by other clients in the same region who closed the file before the open.
+
+- Persistent Caching
+Kurma NFS server has a persistent cache so that hot data can be read in the low-latency on-premises network instead of from remote clouds.
+
+
+### Implementation and Evaluation
+- Implementation:
+Kurma NFS Server: 15800 lines C++
+Gateway Server: 22700 lines java
+**Optimization**:
+> 1. using a separate thread to pre-compute a pool of *keymap*s (RSA keymap), so that Kurma can quickly take one *keymap* out of the pool when creating a file.
+> 2. batch multiple ZooKeeper changes into a single ZooKeeper transaction.
+> 3. Latencies of clouds vary significantly over time: Kurma sorts cloud providers every $N$ seconds and uses the $K$ fastest clouds as backends.
+
+- Evaluation
+Compare with a baseline using a single-node NFS server.
+> Metadata operation, data operations.
+
+## 2. Strength (Contributions of the paper)
+1. This paper proposes a secure geo-distributed multi-cloud storage gateway system, which keeps file-system metadata in trusted gateways and stores the encrypted data blocks in back-end clouds.
+> a geo-distributed file system.
+
+2. To solve the issue of metadata freshness, Kurma embeds a version number and a timestamp into each file block to ensure data freshness.
+
+
+## 3. Weakness (Limitations of the paper)
+1. Network partitions may disrupt the replication of version numbers among gateways. As a result, Kurma cannot guarantee that a client always sees all updates made by clients in other regions.
+
+
+## 4. Future Works
+1. This paper shows a new architecture for networked storage systems; the key idea is to let a trusted gateway store the metadata.
2. Since this paper targets the multi-cloud scenario, a natural issue arises, namely cost efficiency, which this paper does not investigate.
\ No newline at end of file
diff --git a/StoragePaperNote/Storage System/Multi-Cloud/SPANStore-SOSP'13.md b/StoragePaperNote/Storage System/Multi-Cloud/SPANStore-SOSP'13.md
old mode 100644
new mode 100755
index 33078b3..ac7831c
--- a/StoragePaperNote/Storage System/Multi-Cloud/SPANStore-SOSP'13.md
+++ b/StoragePaperNote/Storage System/Multi-Cloud/SPANStore-SOSP'13.md
@@ -1,69 +1,69 @@
----
-typora-copy-images-to: paper_figure
----
-SPANStore: Cost-Effective Geo-Replicated Storage Spanning Multiple Cloud Services
-------------------------------------------
-@SOSP'13 @Multi-cloud
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-Existing problems:
-- Although alomst every storage service offers an isolated pool of storage in each of its data centers. leaving replication across data centers to applications.
-> e.g. the application needs to replicate data in each data center to ensure low latency access to it for users in different locations.
-
-- Cloud providers do not provide a centralized view of storage with **rich semantics**, every application needs to reason on its own about where and how to replicate data
-> to satisfy its latency goals and consistency requirments at low cost.
-
-This paper designs and implements a key-value store that presents a **unified** viewe of storage services in several geographically distributed data centers. Its goal is to minimize the cost incurred by latency-sensitive application providers.
-
-### SPANStore
-SPANStore: span the data centers of multiple cloud service providers.
-- The key ideas of SPANStore:
-1. SPANStore exploits the **pricing discrepancies** to drive down the cost incurred in satisfying application providers' latency, consistency, and fault tolerance goals.
-2. SPANStore determines where to replicate every data object and how to perform this replication.
-> how to do the trade-off between (latency vs. storage costs and expenses)
-
-3. SPANStore ensures all data is largely exchanged directly between application virtual machines and the storage services that SPANStore builds upon.
-
-- The goal of SPANStore:
- 1) Minimize cost
- 2) Respect latency SLOs (service level objectives)
- 3) Flexible consistency (strong consistency, eventual consistency)
- 4) Tolerate failures
-- Technique 1: Harness multiple clouds
-> **SPANStore Architecture**:
-> 
-the application issues PUT and GET requests for objects to a SPANStore library that the application links to.
-> 
-Placement Manager collects a summary of the application's workload and latencies from remote data centers in each epoch. And then computes the replication policies.
-> **Cost Model**: Storage cost + Request Cost + Data transfer cost = Storage service cost
-
-- Technique 2: Aggregate workload prediction per access set
-> Determine replication policy for all objects with same **access set**, consider two factors:
->> application requirements
->> workload properties
-> Leverage application knowledge of sharing pattern (Dropbox/Google Doc know users that share a file)
-
-- Technique 3: Relay propagation
-> capitalize on the discrepancies in pricing across different cloud services and relay updates to the replicas via another data center that has cheaper pricing for upload bandwidth.
-
-### Implementation and Evaluation
-- Implementation:
-> 1) PMan
-> 2) a client library that applications can link to
-> 3) an XMLRPC server that is run in every VM run by SPANStore
-> 4) a memcached cluster to store in-memory metadata
-
-- Evaluation
-Setting: SPANStore is deployed across S3, Azure and GCS
-Simluation:
-> evaluate the cost savings
-> verify application requirements
-
-## 2. Strength (Contributions of the paper)
-- This paper launches a series of experiments to prove using multiple cloud providers can enable SPANStore offers lower GET/PUT latencies and costs.
-- It also the dynamic workload, and solves it by dividing time into different epoch, and determine the strategy depending on the workload in previous epoch.
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+SPANStore: Cost-Effective Geo-Replicated Storage Spanning Multiple Cloud Services
+------------------------------------------
+@SOSP'13 @Multi-cloud
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+Existing problems:
+- Almost every storage service offers only an isolated pool of storage in each of its data centers, leaving replication across data centers to applications.
+> e.g. the application needs to replicate data in each data center to ensure low latency access to it for users in different locations.
+
+- Cloud providers do not offer a centralized view of storage with **rich semantics**, so every application needs to reason on its own about where and how to replicate data
+> to satisfy its latency goals and consistency requirements at low cost.
+
+This paper designs and implements a key-value store that presents a **unified** view of storage services in several geographically distributed data centers. Its goal is to minimize the cost incurred by latency-sensitive application providers.
+
+### SPANStore
+SPANStore: span the data centers of multiple cloud service providers.
+- The key ideas of SPANStore:
+1. SPANStore exploits the **pricing discrepancies** among cloud providers to drive down the cost incurred in satisfying application providers' latency, consistency, and fault tolerance goals.
+2. SPANStore determines where to replicate every data object and how to perform this replication.
+> i.e., how to trade off latency against storage and other service costs
+
+3. SPANStore ensures that data is largely exchanged directly between application virtual machines and the storage services that SPANStore builds upon.
+
+- The goal of SPANStore:
+ 1) Minimize cost
+ 2) Respect latency SLOs (service level objectives)
+ 3) Flexible consistency (strong consistency, eventual consistency)
+ 4) Tolerate failures
+- Technique 1: Harness multiple clouds
+> **SPANStore Architecture**:
+> 
+The application issues PUT and GET requests for objects to a SPANStore library that the application links to.
+> 
+In each epoch, the Placement Manager collects a summary of the application's workload and the latencies to remote data centers, and then computes the replication policies.
+> **Cost Model**: Storage service cost = Storage cost + Request cost + Data transfer cost
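+
+A minimal sketch of how such a per-site cost could be computed (illustrative only; the price and workload numbers below are hypothetical placeholders, not values from the paper):
+
+```python
+# Illustrative storage-service cost for one replica site (hypothetical prices).
+def storage_service_cost(gb_stored, n_puts, n_gets, gb_egress,
+                         price_storage_gb=0.023,   # $/GB-month (assumed)
+                         price_put=5e-6,           # $/PUT request (assumed)
+                         price_get=4e-7,           # $/GET request (assumed)
+                         price_egress_gb=0.09):    # $/GB transferred out (assumed)
+    storage_cost = gb_stored * price_storage_gb
+    request_cost = n_puts * price_put + n_gets * price_get
+    transfer_cost = gb_egress * price_egress_gb
+    return storage_cost + request_cost + transfer_cost
+
+# e.g. 500 GB stored, 1M PUTs, 10M GETs, 200 GB egress in one epoch
+print(round(storage_service_cost(500, 1_000_000, 10_000_000, 200), 2))
+```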
+
+- Technique 2: Aggregate workload prediction per access set
+> Determine the replication policy for all objects with the same **access set**, considering two factors:
+>> application requirements
+>> workload properties
+> Leverage the application's knowledge of sharing patterns (e.g., Dropbox/Google Docs know which users share a file)
+
+- Technique 3: Relay propagation
+> Capitalize on pricing discrepancies across cloud services by relaying updates to the replicas via another data center that has cheaper pricing for upload bandwidth.
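+
+A rough sketch of the relay idea under made-up prices (everything below is hypothetical, just to show why routing through a cheap data center can win):
+
+```python
+# Compare direct propagation vs. relaying through the cheapest-egress data center.
+# upload_price: $/GB of outbound bandwidth per data center (hypothetical values).
+upload_price = {"dc_a": 0.12, "dc_b": 0.09, "dc_c": 0.05}
+
+def propagation_cost(writer, replicas, size_gb, relay=None):
+    if relay is None or relay == writer:
+        return upload_price[writer] * size_gb * len(replicas)   # writer sends to each replica
+    fanout = len([r for r in replicas if r != relay])
+    return (upload_price[writer] * size_gb                      # writer -> relay
+            + upload_price[relay] * size_gb * fanout)           # relay -> remaining replicas
+
+writer, replicas, size = "dc_a", ["dc_b", "dc_c"], 1.0
+best_relay = min(upload_price, key=upload_price.get)
+print(propagation_cost(writer, replicas, size))              # direct: 0.24
+print(propagation_cost(writer, replicas, size, best_relay))  # via dc_c: 0.17
+```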
+
+### Implementation and Evaluation
+- Implementation:
+> 1) PMan
+> 2) a client library that applications can link to
+> 3) an XMLRPC server that runs in every VM operated by SPANStore
+> 4) a memcached cluster to store in-memory metadata
+
+- Evaluation
+Setting: SPANStore is deployed across S3, Azure and GCS
+Simulation:
+> evaluate the cost savings
+> verify application requirements
+
+## 2. Strength (Contributions of the paper)
+- This paper presents a series of experiments showing that spanning multiple cloud providers enables SPANStore to offer lower GET/PUT latencies and costs.
+- It also considers dynamic workloads, handling them by dividing time into epochs and determining the replication strategy based on the workload observed in the previous epoch.
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/Survey/Deduplcation-Survey'18.md b/StoragePaperNote/Survey/Deduplcation-Survey'18.md
old mode 100644
new mode 100755
index a0a9232..0296de9
--- a/StoragePaperNote/Survey/Deduplcation-Survey'18.md
+++ b/StoragePaperNote/Survey/Deduplcation-Survey'18.md
@@ -1,24 +1,24 @@
----
-typora-copy-images-to: paper_figure
----
-A Comprehensive Study of the Past, Present, and Future of Data Deduplication
-------------------------------------------
-@Survey
-[TOC]
-
-## 1. Summary
-### Motivation of this paper:
-
-
-
-### Name of the Work
-
-
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: paper_figure
+---
+A Comprehensive Study of the Past, Present, and Future of Data Deduplication
+------------------------------------------
+@Survey
+[TOC]
+
+## 1. Summary
+### Motivation of this paper:
+
+
+
+### Name of the Work
+
+
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/UnderstandingDedup-SINA'08.md b/StoragePaperNote/UnderstandingDedup-SINA'08.md
old mode 100644
new mode 100755
index d14893a..3421fce
--- a/StoragePaperNote/UnderstandingDedup-SINA'08.md
+++ b/StoragePaperNote/UnderstandingDedup-SINA'08.md
@@ -1,37 +1,37 @@
----
-typora-copy-images-to: ../paper_figure
----
-Understanding Data Deduplication Ratios
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| SINA'08 | Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper will explore the significance of deduplication ratios related to specific capacity optimization techniques within the context of information lifecycle management.
->1. cost saving
->2. risk reduction
->3. process improvement
-
-### Multiple technologies
-- Single instance storage (SIS)
-the replacement of duplicate files or objects with references to a shared copy.
-
-- Data deduplication
-examining a data set or byte stream at sub-file level and storing and /or sending only unique data.
-> the key factor distinguishing data deduplication from SIS is that data is shared at a **sub-file** level.
-
-- Compression
-encoding of data to reduce its storage requirement. Lossless data compression methods allow the exact original data to be reconstructed from the compressed data.
-
-- Copy on write and pointer remapping
-create changed block point in time copies.
-
-- Thin provisioning
-the transparent allocation of physical storage space for data when it is written ("just in time") rather than in advance of anticipated consumption.
-
-
-
-## 4. Future Works
+---
+typora-copy-images-to: ../paper_figure
+---
+Understanding Data Deduplication Ratios
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| SINA'08 | Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper explores the significance of deduplication ratios as they relate to specific capacity optimization techniques, within the context of information lifecycle management.
+>1. cost saving
+>2. risk reduction
+>3. process improvement
+
+### Multiple technologies
+- Single instance storage (SIS)
+the replacement of duplicate files or objects with references to a shared copy.
+
+- Data deduplication
+examining a data set or byte stream at the sub-file level and storing and/or sending only unique data.
+> the key factor distinguishing data deduplication from SIS is that data is shared at a **sub-file** level.
+
+- Compression
+encoding of data to reduce its storage requirement. Lossless data compression methods allow the exact original data to be reconstructed from the compressed data.
+
+- Copy on write and pointer remapping
+creating point-in-time copies of changed blocks.
+
+- Thin provisioning
+the transparent allocation of physical storage space for data when it is written ("just in time") rather than in advance of anticipated consumption.
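+
+As a general reminder of how such ratios are usually reported (standard definitions, not quoted from this paper), a small sketch:
+
+```python
+# Standard conventions: dedup ratio = logical (pre-dedup) bytes / physical (post-dedup) bytes,
+# space savings = 1 - physical / logical.
+def dedup_ratio(logical_bytes, physical_bytes):
+    return logical_bytes / physical_bytes
+
+def space_savings(logical_bytes, physical_bytes):
+    return 1.0 - physical_bytes / logical_bytes
+
+# e.g. 10 TB of backup data stored in 2 TB of unique data -> 5:1 ratio, 80% savings
+print(dedup_ratio(10e12, 2e12), space_savings(10e12, 2e12))
+```
+
+Note the diminishing returns: a 10:1 ratio already saves 90% of the capacity, so pushing it to 20:1 only recovers an extra 5%.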
+
+
+
+## 4. Future Works
diff --git a/StoragePaperNote/Useful Matrial b/StoragePaperNote/Useful Matrial
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/WeakLeakageResilient-ASIACCS'13.md b/StoragePaperNote/WeakLeakageResilient-ASIACCS'13.md
old mode 100644
new mode 100755
index 73b361c..3d5cb72
--- a/StoragePaperNote/WeakLeakageResilient-ASIACCS'13.md
+++ b/StoragePaperNote/WeakLeakageResilient-ASIACCS'13.md
@@ -1,32 +1,32 @@
----
-typora-copy-images-to: ../paper_figure
----
-Weak Leakage-Resilient Client-side Deduplication of Encrypted Data in Cloud Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| ASIA CCS'13 | Secure Deduplication |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-This paper intends to improve the PoW in CCS'11. It presents a secure client-side deduplication scheme, with the following advantages:
-> Its scheme protects data confidentiality against both outside adversaries and honest-but-curious cloud storage server. (PoW CCS'11 trusts cloud storage server in data confidentiality)
-> secure and distribution with sufficient min-entropy. (PoW CCS'11 is particular to a specific type of distribution) of input files.
-
-
-Current client-side deduplication: hash-as-a-proof method. The hash value of a file has two proposes:
-> 1. It is an index of file.
-> 2. it treats as a "proof" that the user owns file $F$.
-
-
-
-### Method Name
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Future Works
+---
+typora-copy-images-to: ../paper_figure
+---
+Weak Leakage-Resilient Client-side Deduplication of Encrypted Data in Cloud Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| ASIA CCS'13 | Secure Deduplication |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+This paper intends to improve on the PoW scheme of CCS'11. It presents a secure client-side deduplication scheme with the following advantages:
+> Its scheme protects data confidentiality against both outside adversaries and the honest-but-curious cloud storage server. (PoW CCS'11 trusts the cloud storage server with data confidentiality.)
+> It is secure for any distribution of input files with sufficient min-entropy. (PoW CCS'11 is restricted to a specific type of input-file distribution.)
+
+
+Current client-side deduplication: the hash-as-a-proof method. The hash value of a file has two purposes:
+> 1. It serves as an index of the file.
+> 2. It is treated as a "proof" that the user owns file $F$.
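+
+A toy sketch of the hash-as-a-proof flow (names and storage layout invented for illustration), which also makes the weakness concrete: anyone who learns the short hash can claim ownership of the file without possessing it.
+
+```python
+import hashlib
+
+# Toy hash-as-a-proof client-side deduplication (illustrative only).
+store = {}  # server side: H(F) -> file bytes
+
+def client_upload(server_index, file_bytes):
+    tag = hashlib.sha256(file_bytes).hexdigest()
+    if tag in server_index:            # server already has the file:
+        return tag                     # no upload, client is granted ownership
+    server_index[tag] = file_bytes     # first uploader sends the full file
+    return tag
+
+def claim_with_tag_only(server_index, tag):
+    # The flaw: the short hash alone acts as the ownership "proof",
+    # so leaking H(F) is almost as bad as leaking F itself.
+    return server_index.get(tag)
+
+owner_tag = client_upload(store, b"confidential report")
+print(claim_with_tag_only(store, owner_tag) is not None)  # True: the tag suffices
+```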
+
+
+
+### Method Name
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Future Works
diff --git a/StoragePaperNote/desktop.ini b/StoragePaperNote/desktop.ini
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/iDedup-FAST'12.md b/StoragePaperNote/iDedup-FAST'12.md
old mode 100644
new mode 100755
index c81d8c3..6770376
--- a/StoragePaperNote/iDedup-FAST'12.md
+++ b/StoragePaperNote/iDedup-FAST'12.md
@@ -1,120 +1,120 @@
----
-typora-copy-images-to: ../paper_figure
----
-iDedup: Latency-aware, Inline Data Deduplication for Primary Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| FAST'12 | Deduplication System |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-- Motivation
-Many primary storage workloads are unable to leverage the benefits of deduplication
-> due to the associated latency costs.
-
-Prior research has not applied deduplication techniques **inline** to the request path for **latency sensitive**, **primary workloads**.
-> inline deduplication: add work to the write path, increase latency
-> offline deduplication: wait for system idle time to do deduplication.
-> reads remain fragmented in both.
-
-- Disadvantages of offline deduplication
- - cause a bloat in storage usage leading to inaccurate space accounting and provisioning
- - need system idle time to perform deduplication without impacting foreground requests.
- - offline deduplication uses extra disk bandwidth when reading in the staged data.
-
-- Current workloads have two insights:
-> 1. spatial locality
-> 2. temporal locality
-
-Key question: how to do the tradeoff between capacity savings and deduplication performance?
-
-### iDedup
-- Goal: not increase the latency of the already latency sensitive, foreground operations.
-1. read operation: fragmentation in data layout.
-2. write operation: to identify duplicates, on-disk data structures are accessed.
-
-- Main idea
-1. Amortize the seeks caused by deduplication by only performing deduplication when a sequence of on-disk blocks are duplicated.
-> examine blocks at write time
-> configure a *minimum sequence length*
-> tradeoff: capacity savings and performance
-
-2. maintain an in-memory fingerprint cache to detect duplicates in lieu of any on-disk structures.
-> a completely memory-resident, LRU cache.
-> tradeoff: performance (hit rate) and capacity savings (dedup-metadata size)
-
-- Design rationale
-1. *Spatial locality* in the data workloads
-Duplicated data is clustered.
-
-2. *Temporal locality* in the data workloads
-making the fingerprint table amenable to caching
-
-
-- System Architecture
-
-
-
-1. Cache design
-One entry per block.
-> maps the fingerprint of a block to its disk block number (DBN) on disk.
-> use LRU policy, (fingerprint, DBN)
-
-2. Metadata management
-In RAM:
-> Dedup-metadata cache: a pool of block entries (content-nodes)
-> Fingerprint hash table: maps fingerprint to DBN
-> DBN hash table: map DBN to its content-node.
-
-In disk
-> Reference count file: maintains reference counts of deduplicated file system blocks in a file.
->
-> > refcount updates are often collocated to the same disk blocks (thereby amortizing IOs to the refcount file)
-
-3. iDedup algorithm: Sequence identification
-
-
-
-### Implementation and Evaluation
-
-
-- Evaluation
-Two tunable parameters:
-> 1. the minimum duplicate sequence threshold
-> 2. in-memory dedup-metadata cache size
-
-Two comparisons:
-1. baseline: without deduplication
-2. threshold = 1: exact deduplication
-
-1. Deduplication ratio vs. threshold
-threshold increases, the deduplication ratio drops
-2. Disk fragmentation vs. threshold
-threshold increases, fragmentation decreases
-3. client read response time vs. threshold
-same trend as disk fragmentation
-4. CPU utilization vs. threshold
-utilization increases slightly with the threshold
-iDedup algorithm has little impacts on the overall utilization
-5. Buffer cache hit rate vs. dedup-metadata cache size
-
-
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-1. This paper provides the insights on spatial and temporal locality of deduplicated data in real-world, primary workloads.
-
-
-## 4. Some Insights (Future work)
-1. This paper mentions that the higher the deduplication ratio, the higher the likelihood of fragmentation.
-> deduplication can convert sequential reads from the application into random reads from storage.
-
-2. It mentions the threshold in iDedup must be derived empirically to match the randomness in the workload.
-> depends on the workload property
-> how to enable the system to automatically make this tradeoff.
-
-3. primary storage system trace
+---
+typora-copy-images-to: ../paper_figure
+---
+iDedup: Latency-aware, Inline Data Deduplication for Primary Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| FAST'12 | Deduplication System |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+- Motivation
+Many primary storage workloads are unable to leverage the benefits of deduplication
+> due to the associated latency costs.
+
+Prior research has not applied deduplication techniques **inline** to the request path for **latency sensitive**, **primary workloads**.
+> inline deduplication: add work to the write path, increase latency
+> offline deduplication: wait for system idle time to do deduplication.
+> reads remain fragmented in both.
+
+- Disadvantages of offline deduplication
+ - cause a bloat in storage usage leading to inaccurate space accounting and provisioning
+ - need system idle time to perform deduplication without impacting foreground requests.
+ - offline deduplication uses extra disk bandwidth when reading in the staged data.
+
+- Two insights about current workloads:
+> 1. spatial locality
+> 2. temporal locality
+
+Key question: how to trade off capacity savings against deduplication performance?
+
+### iDedup
+- Goal: do not increase the latency of the already latency-sensitive foreground operations.
+1. read operation: fragmentation in data layout.
+2. write operation: to identify duplicates, on-disk data structures are accessed.
+
+- Main idea
+1. Amortize the seeks caused by deduplication by performing deduplication only when a sequence of on-disk blocks is duplicated.
+> examine blocks at write time
+> configure a *minimum sequence length*
+> tradeoff: capacity savings and performance
+
+2. maintain an in-memory fingerprint cache to detect duplicates in lieu of any on-disk structures.
+> a completely memory-resident, LRU cache.
+> tradeoff: performance (hit rate) and capacity savings (dedup-metadata size)
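+
+A minimal sketch of such a memory-resident fingerprint cache (my own simplification: a capacity-bounded LRU over fingerprint-to-DBN entries, not the paper's actual implementation):
+
+```python
+from collections import OrderedDict
+
+# Simplified in-memory fingerprint cache: fingerprint -> DBN, evicted in LRU order.
+class FingerprintCache:
+    def __init__(self, capacity):
+        self.capacity = capacity
+        self.entries = OrderedDict()     # fingerprint -> disk block number (DBN)
+
+    def lookup(self, fp):
+        dbn = self.entries.get(fp)
+        if dbn is not None:
+            self.entries.move_to_end(fp)         # refresh LRU position on a hit
+        return dbn
+
+    def insert(self, fp, dbn):
+        if fp not in self.entries and len(self.entries) >= self.capacity:
+            self.entries.popitem(last=False)     # evict the least recently used entry
+        self.entries[fp] = dbn
+        self.entries.move_to_end(fp)
+```
+
+The `capacity` knob is exactly the tradeoff mentioned above: a larger cache raises the hit rate (more duplicates detected) at the cost of more dedup-metadata in RAM.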
+
+- Design rationale
+1. *Spatial locality* in the data workloads
+Duplicated data is clustered.
+
+2. *Temporal locality* in the data workloads
+Duplicated data recurs close together in time, making the fingerprint table amenable to caching.
+
+
+- System Architecture
+
+
+
+1. Cache design
+One entry per block.
+> maps the fingerprint of a block to its disk block number (DBN) on disk.
+> uses an LRU policy over (fingerprint, DBN) entries
+
+2. Metadata management
+In RAM:
+> Dedup-metadata cache: a pool of block entries (content-nodes)
+> Fingerprint hash table: maps fingerprint to DBN
+> DBN hash table: maps a DBN to its content-node.
+
+On disk:
+> Reference count file: maintains reference counts of deduplicated file system blocks in a file.
+>
+> > refcount updates are often collocated to the same disk blocks (thereby amortizing IOs to the refcount file)
+
+3. iDedup algorithm: Sequence identification
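+
+Since the figure is not reproduced here, a rough sketch of the sequence-identification idea (my own simplification, not the paper's algorithm): scan the blocks of an incoming write, grow runs whose fingerprints hit the cache at consecutive DBNs, and deduplicate only the runs that reach the minimum sequence length.
+
+```python
+# Simplified sequence identification for one incoming write (illustrative only).
+# fp_cache is any mapping from fingerprint -> DBN of an existing on-disk copy.
+def choose_dedup_runs(block_fps, fp_cache, min_seq_len):
+    runs, run, prev_dbn = [], [], None
+    for i, fp in enumerate(block_fps):
+        dbn = fp_cache.get(fp)
+        if dbn is not None and (prev_dbn is None or dbn == prev_dbn + 1):
+            run.append((i, dbn))                 # extend a duplicate run that is
+            prev_dbn = dbn                       # also sequential on disk
+            continue
+        if len(run) >= min_seq_len:
+            runs.append(run)                     # long enough: share these blocks
+        run, prev_dbn = ([(i, dbn)], dbn) if dbn is not None else ([], None)
+    if len(run) >= min_seq_len:
+        runs.append(run)
+    return runs   # blocks outside the returned runs are written as new data
+
+# e.g. only the 4-block run starting at offset 1 is deduplicated when min_seq_len=4
+cache = {"b": 10, "c": 11, "d": 12, "e": 13, "x": 50}
+print(choose_dedup_runs(list("abcdexz"), cache, 4))
+```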
+
+
+
+### Implementation and Evaluation
+
+
+- Evaluation
+Two tunable parameters:
+> 1. the minimum duplicate sequence threshold
+> 2. in-memory dedup-metadata cache size
+
+Two comparisons:
+1. baseline: without deduplication
+2. threshold = 1: exact deduplication
+
+1. Deduplication ratio vs. threshold
+threshold increases, the deduplication ratio drops
+2. Disk fragmentation vs. threshold
+threshold increases, fragmentation decreases
+3. Client read response time vs. threshold
+same trend as disk fragmentation
+4. CPU utilization vs. threshold
+utilization increases slightly with the threshold
+the iDedup algorithm has little impact on the overall utilization
+5. Buffer cache hit rate vs. dedup-metadata cache size
+
+
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+1. This paper provides insights on the spatial and temporal locality of deduplicated data in real-world, primary workloads.
+
+
+## 4. Some Insights (Future work)
+1. This paper mentions that the higher the deduplication ratio, the higher the likelihood of fragmentation.
+> deduplication can convert sequential reads from the application into random reads from storage.
+
+2. It mentions that the threshold in iDedup must be derived empirically to match the randomness in the workload.
+> this depends on the workload properties
+> an open question is how to enable the system to make this tradeoff automatically
+
+3. Primary storage system traces:
CIFS traces: NetApp (USENIX ATC'08)
\ No newline at end of file
diff --git a/StoragePaperNote/paper_figure/1522721106458.png b/StoragePaperNote/paper_figure/1522721106458.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1522938283913.png b/StoragePaperNote/paper_figure/1522938283913.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1523241524030.png b/StoragePaperNote/paper_figure/1523241524030.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1523329925719.png b/StoragePaperNote/paper_figure/1523329925719.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1526027787768.png b/StoragePaperNote/paper_figure/1526027787768.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1526305327024.png b/StoragePaperNote/paper_figure/1526305327024.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1526389148656.png b/StoragePaperNote/paper_figure/1526389148656.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1531322452717.png b/StoragePaperNote/paper_figure/1531322452717.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1531501985565.png b/StoragePaperNote/paper_figure/1531501985565.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533545264630.png b/StoragePaperNote/paper_figure/1533545264630.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533558772424.png b/StoragePaperNote/paper_figure/1533558772424.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533559666806.png b/StoragePaperNote/paper_figure/1533559666806.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533632589963.png b/StoragePaperNote/paper_figure/1533632589963.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533635551383.png b/StoragePaperNote/paper_figure/1533635551383.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533713900014.png b/StoragePaperNote/paper_figure/1533713900014.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533716802051.png b/StoragePaperNote/paper_figure/1533716802051.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533785674947.png b/StoragePaperNote/paper_figure/1533785674947.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533801963837.png b/StoragePaperNote/paper_figure/1533801963837.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533803970846.png b/StoragePaperNote/paper_figure/1533803970846.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533832651101.png b/StoragePaperNote/paper_figure/1533832651101.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533909218190.png b/StoragePaperNote/paper_figure/1533909218190.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533978994514.png b/StoragePaperNote/paper_figure/1533978994514.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533991845908.png b/StoragePaperNote/paper_figure/1533991845908.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1533992512318.png b/StoragePaperNote/paper_figure/1533992512318.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534147681755.png b/StoragePaperNote/paper_figure/1534147681755.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534231184704.png b/StoragePaperNote/paper_figure/1534231184704.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534231209288.png b/StoragePaperNote/paper_figure/1534231209288.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534322692743.png b/StoragePaperNote/paper_figure/1534322692743.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534324237488.png b/StoragePaperNote/paper_figure/1534324237488.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534491739567.png b/StoragePaperNote/paper_figure/1534491739567.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534491996294.png b/StoragePaperNote/paper_figure/1534491996294.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534492052150.png b/StoragePaperNote/paper_figure/1534492052150.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534492144193.png b/StoragePaperNote/paper_figure/1534492144193.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534493095149.png b/StoragePaperNote/paper_figure/1534493095149.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534493102024.png b/StoragePaperNote/paper_figure/1534493102024.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534514760021.png b/StoragePaperNote/paper_figure/1534514760021.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534820931384.png b/StoragePaperNote/paper_figure/1534820931384.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534843008014.png b/StoragePaperNote/paper_figure/1534843008014.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534859524747.png b/StoragePaperNote/paper_figure/1534859524747.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534927153524.png b/StoragePaperNote/paper_figure/1534927153524.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1534941877505.png b/StoragePaperNote/paper_figure/1534941877505.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535025690952.png b/StoragePaperNote/paper_figure/1535025690952.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535080941866.png b/StoragePaperNote/paper_figure/1535080941866.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535118132918.png b/StoragePaperNote/paper_figure/1535118132918.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535424966773.png b/StoragePaperNote/paper_figure/1535424966773.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535509143693.png b/StoragePaperNote/paper_figure/1535509143693.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535529964676.png b/StoragePaperNote/paper_figure/1535529964676.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535530049894.png b/StoragePaperNote/paper_figure/1535530049894.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535630992315.png b/StoragePaperNote/paper_figure/1535630992315.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1535718654925.png b/StoragePaperNote/paper_figure/1535718654925.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1536304812187.png b/StoragePaperNote/paper_figure/1536304812187.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1536304821166.png b/StoragePaperNote/paper_figure/1536304821166.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1536311837927.png b/StoragePaperNote/paper_figure/1536311837927.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1536314355540.png b/StoragePaperNote/paper_figure/1536314355540.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1536411514566.png b/StoragePaperNote/paper_figure/1536411514566.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1536422986790.png b/StoragePaperNote/paper_figure/1536422986790.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537101426419.png b/StoragePaperNote/paper_figure/1537101426419.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537113167500.png b/StoragePaperNote/paper_figure/1537113167500.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537156395887.png b/StoragePaperNote/paper_figure/1537156395887.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537170968620.png b/StoragePaperNote/paper_figure/1537170968620.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537502009449.png b/StoragePaperNote/paper_figure/1537502009449.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537519750596.png b/StoragePaperNote/paper_figure/1537519750596.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537523583894.png b/StoragePaperNote/paper_figure/1537523583894.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537620570242.png b/StoragePaperNote/paper_figure/1537620570242.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537856398909.png b/StoragePaperNote/paper_figure/1537856398909.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1537856407481.png b/StoragePaperNote/paper_figure/1537856407481.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1538214740187.png b/StoragePaperNote/paper_figure/1538214740187.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1538237284824.png b/StoragePaperNote/paper_figure/1538237284824.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1538364574274.png b/StoragePaperNote/paper_figure/1538364574274.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1539263939506.png b/StoragePaperNote/paper_figure/1539263939506.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1539311866259.png b/StoragePaperNote/paper_figure/1539311866259.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1546917876953.png b/StoragePaperNote/paper_figure/1546917876953.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1546918149839.png b/StoragePaperNote/paper_figure/1546918149839.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1546932031713.png b/StoragePaperNote/paper_figure/1546932031713.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1546937172649.png b/StoragePaperNote/paper_figure/1546937172649.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1546937279046.png b/StoragePaperNote/paper_figure/1546937279046.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1547042229954.png b/StoragePaperNote/paper_figure/1547042229954.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1547437042911.png b/StoragePaperNote/paper_figure/1547437042911.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1547437239206.png b/StoragePaperNote/paper_figure/1547437239206.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1547449080194.png b/StoragePaperNote/paper_figure/1547449080194.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1547719418272.png b/StoragePaperNote/paper_figure/1547719418272.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1547724806425.png b/StoragePaperNote/paper_figure/1547724806425.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1548039339341.png b/StoragePaperNote/paper_figure/1548039339341.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1550132357894.png b/StoragePaperNote/paper_figure/1550132357894.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1550132390502.png b/StoragePaperNote/paper_figure/1550132390502.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1550133510605.png b/StoragePaperNote/paper_figure/1550133510605.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1550145505105.png b/StoragePaperNote/paper_figure/1550145505105.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1550738798721.png b/StoragePaperNote/paper_figure/1550738798721.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1551686443341.png b/StoragePaperNote/paper_figure/1551686443341.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1551708715146.png b/StoragePaperNote/paper_figure/1551708715146.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1551708732566.png b/StoragePaperNote/paper_figure/1551708732566.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1552535525593.png b/StoragePaperNote/paper_figure/1552535525593.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1552545363095.png b/StoragePaperNote/paper_figure/1552545363095.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1552878709205.png b/StoragePaperNote/paper_figure/1552878709205.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1552893983832.png b/StoragePaperNote/paper_figure/1552893983832.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1552911506553.png b/StoragePaperNote/paper_figure/1552911506553.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1552912503121.png b/StoragePaperNote/paper_figure/1552912503121.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1553325398564.png b/StoragePaperNote/paper_figure/1553325398564.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1553325419379.png b/StoragePaperNote/paper_figure/1553325419379.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1553335463524.png b/StoragePaperNote/paper_figure/1553335463524.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1553335487753.png b/StoragePaperNote/paper_figure/1553335487753.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1553497729383.png b/StoragePaperNote/paper_figure/1553497729383.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1553516953870.png b/StoragePaperNote/paper_figure/1553516953870.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1554376060947.png b/StoragePaperNote/paper_figure/1554376060947.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1554865684235.png b/StoragePaperNote/paper_figure/1554865684235.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1555763466457.png b/StoragePaperNote/paper_figure/1555763466457.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1555765487674.png b/StoragePaperNote/paper_figure/1555765487674.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1555769164180.png b/StoragePaperNote/paper_figure/1555769164180.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1555770525391.png b/StoragePaperNote/paper_figure/1555770525391.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1556250344815.png b/StoragePaperNote/paper_figure/1556250344815.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1556250623415.png b/StoragePaperNote/paper_figure/1556250623415.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1556280764754.png b/StoragePaperNote/paper_figure/1556280764754.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1557370885604.png b/StoragePaperNote/paper_figure/1557370885604.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1557642537365.png b/StoragePaperNote/paper_figure/1557642537365.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1557673345371.png b/StoragePaperNote/paper_figure/1557673345371.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1557997523898.png b/StoragePaperNote/paper_figure/1557997523898.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1557997548672.png b/StoragePaperNote/paper_figure/1557997548672.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558079145575.png b/StoragePaperNote/paper_figure/1558079145575.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558494722437.png b/StoragePaperNote/paper_figure/1558494722437.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558513853943.png b/StoragePaperNote/paper_figure/1558513853943.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558519202625.png b/StoragePaperNote/paper_figure/1558519202625.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558614526597.png b/StoragePaperNote/paper_figure/1558614526597.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558614554155.png b/StoragePaperNote/paper_figure/1558614554155.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558623033915.png b/StoragePaperNote/paper_figure/1558623033915.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558962484711.png b/StoragePaperNote/paper_figure/1558962484711.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1558962790911.png b/StoragePaperNote/paper_figure/1558962790911.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1559037152377.png b/StoragePaperNote/paper_figure/1559037152377.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1560441378608.png b/StoragePaperNote/paper_figure/1560441378608.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1562405070274.png b/StoragePaperNote/paper_figure/1562405070274.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1564405750463.png b/StoragePaperNote/paper_figure/1564405750463.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/1564457932100.png b/StoragePaperNote/paper_figure/1564457932100.png
old mode 100644
new mode 100755
diff --git a/StoragePaperNote/paper_figure/image-20201029020046029.png b/StoragePaperNote/paper_figure/image-20201029020046029.png
new file mode 100755
index 0000000..4b727ff
Binary files /dev/null and b/StoragePaperNote/paper_figure/image-20201029020046029.png differ
diff --git a/StoragePaperNote/template.md b/StoragePaperNote/template.md
old mode 100644
new mode 100755
index d885e0d..c88408b
--- a/StoragePaperNote/template.md
+++ b/StoragePaperNote/template.md
@@ -1,23 +1,23 @@
----
-typora-copy-images-to: ../paper_figure
----
-GoSeed: Generating an Optimal Seeding Plan for Deduplicated Storage
-------------------------------------------
-| Venue | Category |
-| :------------------------: | :------------------: |
-| UCC'19 | Data Encryption |
-[TOC]
-
-## 1. Summary
-### Motivation of this paper
-
-### Method Name
-
-### Implementation and Evaluation
-
-## 2. Strength (Contributions of the paper)
-
-## 3. Weakness (Limitations of the paper)
-
-## 4. Some Insights (Future work)
-
+---
+typora-copy-images-to: ../paper_figure
+---
+GoSeed: Generating an Optimal Seeding Plan for Deduplicated Storage
+------------------------------------------
+| Venue | Category |
+| :------------------------: | :------------------: |
+| UCC'19 | Data Encryption |
+[TOC]
+
+## 1. Summary
+### Motivation of this paper
+
+### Method Name
+
+### Implementation and Evaluation
+
+## 2. Strength (Contributions of the paper)
+
+## 3. Weakness (Limitations of the paper)
+
+## 4. Some Insights (Future work)
+
diff --git a/paper_figure/1559037213718.png b/paper_figure/1559037213718.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559037217373.png b/paper_figure/1559037217373.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559318184036.png b/paper_figure/1559318184036.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559318207399.png b/paper_figure/1559318207399.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559372865123.png b/paper_figure/1559372865123.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559378652026.png b/paper_figure/1559378652026.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559549453140.png b/paper_figure/1559549453140.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559551516676.png b/paper_figure/1559551516676.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559566271088.png b/paper_figure/1559566271088.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1559919212428.png b/paper_figure/1559919212428.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560000228134.png b/paper_figure/1560000228134.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560135727490.png b/paper_figure/1560135727490.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560174931404.png b/paper_figure/1560174931404.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560175081156.png b/paper_figure/1560175081156.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560263818057.png b/paper_figure/1560263818057.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560263846164.png b/paper_figure/1560263846164.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560321278256.png b/paper_figure/1560321278256.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560321327144.png b/paper_figure/1560321327144.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560321404408.png b/paper_figure/1560321404408.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560328097317.png b/paper_figure/1560328097317.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560328134020.png b/paper_figure/1560328134020.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560329084025.png b/paper_figure/1560329084025.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560668911327.png b/paper_figure/1560668911327.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560672462328.png b/paper_figure/1560672462328.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560684390577.png b/paper_figure/1560684390577.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560764288518.png b/paper_figure/1560764288518.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1560779509995.png b/paper_figure/1560779509995.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561021159924.png b/paper_figure/1561021159924.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561195550827.png b/paper_figure/1561195550827.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561365990754.png b/paper_figure/1561365990754.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561383117771.png b/paper_figure/1561383117771.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561383159912.png b/paper_figure/1561383159912.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561385818759.png b/paper_figure/1561385818759.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561621592254.png b/paper_figure/1561621592254.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561707845616.png b/paper_figure/1561707845616.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561708816690.png b/paper_figure/1561708816690.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561714450649.png b/paper_figure/1561714450649.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561970185538.png b/paper_figure/1561970185538.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1561973214857.png b/paper_figure/1561973214857.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1562167485203.png b/paper_figure/1562167485203.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1562229396844.png b/paper_figure/1562229396844.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1562232295413.png b/paper_figure/1562232295413.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1562591190121.png b/paper_figure/1562591190121.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1562596227030.png b/paper_figure/1562596227030.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1564909153784.png b/paper_figure/1564909153784.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1565180832447.png b/paper_figure/1565180832447.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1565186648789.png b/paper_figure/1565186648789.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1565186677190.png b/paper_figure/1565186677190.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1565942057872.png b/paper_figure/1565942057872.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1565946348305.png b/paper_figure/1565946348305.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1568705054880.png b/paper_figure/1568705054880.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1569071462061.png b/paper_figure/1569071462061.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1569076126086.png b/paper_figure/1569076126086.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1569251146102.png b/paper_figure/1569251146102.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1570072565254.png b/paper_figure/1570072565254.png
old mode 100644
new mode 100755
diff --git a/paper_figure/1570362198248.png b/paper_figure/1570362198248.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191106114728286.png b/paper_figure/image-20191106114728286.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191118003954646.png b/paper_figure/image-20191118003954646.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191122113840773.png b/paper_figure/image-20191122113840773.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191122113917775.png b/paper_figure/image-20191122113917775.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191123134134016.png b/paper_figure/image-20191123134134016.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191124230639274.png b/paper_figure/image-20191124230639274.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191126100315199.png b/paper_figure/image-20191126100315199.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191127151115458.png b/paper_figure/image-20191127151115458.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191128142955079.png b/paper_figure/image-20191128142955079.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191128153143279.png b/paper_figure/image-20191128153143279.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191128160533652.png b/paper_figure/image-20191128160533652.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191205195055060.png b/paper_figure/image-20191205195055060.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191209162702042.png b/paper_figure/image-20191209162702042.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191217204932349.png b/paper_figure/image-20191217204932349.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191217205029441.png b/paper_figure/image-20191217205029441.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191223162054823.png b/paper_figure/image-20191223162054823.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191224214842931.png b/paper_figure/image-20191224214842931.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20191224214916387.png b/paper_figure/image-20191224214916387.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200130153405675.png b/paper_figure/image-20200130153405675.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200306224943511.png b/paper_figure/image-20200306224943511.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200308183445934.png b/paper_figure/image-20200308183445934.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200714012030378.png b/paper_figure/image-20200714012030378.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200714142649199.png b/paper_figure/image-20200714142649199.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200717161721279.png b/paper_figure/image-20200717161721279.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200717175301020.png b/paper_figure/image-20200717175301020.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200717223501471.png b/paper_figure/image-20200717223501471.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200718163048858.png b/paper_figure/image-20200718163048858.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200731155149662.png b/paper_figure/image-20200731155149662.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200914021803963.png b/paper_figure/image-20200914021803963.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200916145707212.png b/paper_figure/image-20200916145707212.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200916150327642.png b/paper_figure/image-20200916150327642.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200916152327699.png b/paper_figure/image-20200916152327699.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200918215222896.png b/paper_figure/image-20200918215222896.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200918215434370.png b/paper_figure/image-20200918215434370.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200918224123859.png b/paper_figure/image-20200918224123859.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200919014910600.png b/paper_figure/image-20200919014910600.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200923153618230.png b/paper_figure/image-20200923153618230.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20200923153649390.png b/paper_figure/image-20200923153649390.png
old mode 100644
new mode 100755
diff --git a/paper_figure/image-20201110121332322.png b/paper_figure/image-20201110121332322.png
new file mode 100755
index 0000000..2478867
Binary files /dev/null and b/paper_figure/image-20201110121332322.png differ
diff --git a/paper_figure/image-20201110154834949.png b/paper_figure/image-20201110154834949.png
new file mode 100755
index 0000000..bfc5dd0
Binary files /dev/null and b/paper_figure/image-20201110154834949.png differ
diff --git a/paper_figure/image-20201128184758932.png b/paper_figure/image-20201128184758932.png
new file mode 100755
index 0000000..1d2fede
Binary files /dev/null and b/paper_figure/image-20201128184758932.png differ
diff --git a/paper_figure/image-20201128194505744.png b/paper_figure/image-20201128194505744.png
new file mode 100755
index 0000000..4f67e3a
Binary files /dev/null and b/paper_figure/image-20201128194505744.png differ
diff --git a/paper_figure/image-20201220224404400.png b/paper_figure/image-20201220224404400.png
new file mode 100755
index 0000000..b35d2fc
Binary files /dev/null and b/paper_figure/image-20201220224404400.png differ
diff --git a/paper_figure/image-20201221014051897.png b/paper_figure/image-20201221014051897.png
new file mode 100755
index 0000000..35d1ddc
Binary files /dev/null and b/paper_figure/image-20201221014051897.png differ