
Commit

Add in ability to download, add in hidden fields next to emoji for download to show values, add in missing framework support for iceberg and delta lake
pflooky committed Nov 14, 2023
1 parent 48ace3e commit c0f24b8
Showing 15 changed files with 278 additions and 47 deletions.
99 changes: 85 additions & 14 deletions docs/file/index.md

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions docs/javascripts/tableselect.js
@@ -23,11 +23,13 @@ document$.subscribe(function() {
  const tables = document.querySelectorAll("table");
  tables.forEach(function(table) {
    const dataTable = new DataTable(table, {
+     dom: "Bfrtip",
      ordering: false,
      paging: false,
      autoWidth: true,
-     fixedHeader: true
-   })
+     fixedHeader: true,
+     buttons: ['copy', 'csv', 'excel', 'pdf']
+   });

document.querySelectorAll("a.toggle-vis").forEach((el) => {
el.addEventListener('click', function (e) {
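For reference, the full subscriber in docs/javascripts/tableselect.js presumably ends up looking like the sketch below after this change. The outer document$.subscribe wrapper and the exact indentation are assumed from the hunk context rather than copied from the whole file. The dom: "Bfrtip" layout string is what places the Buttons toolbar (B) ahead of the filter, table and info blocks, and the copy/csv/excel/pdf buttons rely on the buttons.html5, JSZip and pdfmake assets added to mkdocs.yml further down.

    // Sketch of the assumed post-change state of docs/javascripts/tableselect.js (not verbatim).
    // document$ is the mkdocs-material observable; DataTable is the global exposed by
    // jquery.dataTables 1.13, with Buttons/FixedHeader loaded from the CDN entries in mkdocs.yml.
    document$.subscribe(function() {
      const tables = document.querySelectorAll("table");
      tables.forEach(function(table) {
        const dataTable = new DataTable(table, {
          // "B" renders the Buttons toolbar before the filter (f), table (rt) and info/paging (ip) elements
          dom: "Bfrtip",
          ordering: false,
          paging: false,
          autoWidth: true,
          fixedHeader: true,
          // 'copy'/'csv' come from buttons.html5; 'excel' additionally needs JSZip; 'pdf' needs pdfmake + vfs_fonts
          buttons: ['copy', 'csv', 'excel', 'pdf']
        });
      });
    });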
6 changes: 6 additions & 0 deletions mkdocs.yml
@@ -45,11 +45,17 @@ extra_javascript:
- https://code.jquery.com/jquery-3.7.0.js
- https://cdn.datatables.net/1.13.7/js/jquery.dataTables.min.js
- https://cdn.datatables.net/fixedheader/3.4.0/js/dataTables.fixedHeader.min.js
+ - https://cdn.datatables.net/buttons/2.4.2/js/dataTables.buttons.min.js
+ - https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js
+ - https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/pdfmake.min.js
+ - https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/vfs_fonts.js
+ - https://cdn.datatables.net/buttons/2.4.2/js/buttons.html5.min.js
- javascripts/tableselect.js

extra_css:
- https://cdn.datatables.net/1.13.7/css/jquery.dataTables.min.css
- https://cdn.datatables.net/fixedheader/3.4.0/css/fixedHeader.dataTables.min.css
+ - https://cdn.datatables.net/buttons/2.4.2/css/buttons.dataTables.min.css
- stylesheets/extra.css

extra:
12 changes: 12 additions & 0 deletions site/404.html
@@ -49,6 +49,8 @@

<link rel="stylesheet" href="https://cdn.datatables.net/fixedheader/3.4.0/css/fixedHeader.dataTables.min.css">

<link rel="stylesheet" href="https://cdn.datatables.net/buttons/2.4.2/css/buttons.dataTables.min.css">

<link rel="stylesheet" href="/stylesheets/extra.css">

<script>__md_scope=new URL("/",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
@@ -462,6 +464,16 @@ <h1>404 - Not found</h1>

<script src="https://cdn.datatables.net/fixedheader/3.4.0/js/dataTables.fixedHeader.min.js"></script>

<script src="https://cdn.datatables.net/buttons/2.4.2/js/dataTables.buttons.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/pdfmake.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/vfs_fonts.js"></script>

<script src="https://cdn.datatables.net/buttons/2.4.2/js/buttons.html5.min.js"></script>

<script src="/javascripts/tableselect.js"></script>


12 changes: 12 additions & 0 deletions site/database/index.html
@@ -51,6 +51,8 @@

<link rel="stylesheet" href="https://cdn.datatables.net/fixedheader/3.4.0/css/fixedHeader.dataTables.min.css">

<link rel="stylesheet" href="https://cdn.datatables.net/buttons/2.4.2/css/buttons.dataTables.min.css">

<link rel="stylesheet" href="../stylesheets/extra.css">

<script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
@@ -483,6 +485,16 @@ <h1 id="databases">Databases</h1>

<script src="https://cdn.datatables.net/fixedheader/3.4.0/js/dataTables.fixedHeader.min.js"></script>

<script src="https://cdn.datatables.net/buttons/2.4.2/js/dataTables.buttons.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/pdfmake.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/vfs_fonts.js"></script>

<script src="https://cdn.datatables.net/buttons/2.4.2/js/buttons.html5.min.js"></script>

<script src="../javascripts/tableselect.js"></script>


111 changes: 97 additions & 14 deletions site/file/index.html

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions site/index.html
@@ -53,6 +53,8 @@

<link rel="stylesheet" href="https://cdn.datatables.net/fixedheader/3.4.0/css/fixedHeader.dataTables.min.css">

<link rel="stylesheet" href="https://cdn.datatables.net/buttons/2.4.2/css/buttons.dataTables.min.css">

<link rel="stylesheet" href="stylesheets/extra.css">

<script>__md_scope=new URL(".",location),__md_hash=e=>[...e].reduce((e,_)=>(e<<5)-e+_.charCodeAt(0),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
@@ -577,6 +579,16 @@ <h2 id="categories">Categories</h2>

<script src="https://cdn.datatables.net/fixedheader/3.4.0/js/dataTables.fixedHeader.min.js"></script>

<script src="https://cdn.datatables.net/buttons/2.4.2/js/dataTables.buttons.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/jszip/3.10.1/jszip.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/pdfmake.min.js"></script>

<script src="https://cdnjs.cloudflare.com/ajax/libs/pdfmake/0.1.53/vfs_fonts.js"></script>

<script src="https://cdn.datatables.net/buttons/2.4.2/js/buttons.html5.min.js"></script>

<script src="javascripts/tableselect.js"></script>


6 changes: 4 additions & 2 deletions site/javascripts/tableselect.js
@@ -23,11 +23,13 @@ document$.subscribe(function() {
  const tables = document.querySelectorAll("table");
  tables.forEach(function(table) {
    const dataTable = new DataTable(table, {
+     dom: "Bfrtip",
      ordering: false,
      paging: false,
      autoWidth: true,
-     fixedHeader: true
-   })
+     fixedHeader: true,
+     buttons: ['copy', 'csv', 'excel', 'pdf']
+   });

document.querySelectorAll("a.toggle-vis").forEach((el) => {
el.addEventListener('click', function (e) {
2 changes: 1 addition & 1 deletion site/search/search_index.json
@@ -1 +1 @@
{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"],"fields":{"title":{"boost":1000.0},"text":{"boost":1.0},"tags":{"boost":1000000.0}}},"docs":[{"location":"","title":"Data Tech Compare","text":"<p>Compare all data related technologies with each other to find the best fit for you and your use case.</p>"},{"location":"#categories","title":"Categories","text":"<ul> <li> <p> Files</p> <p>CSV, Parquet, ORC, JSON, Avro, etc.</p> </li> </ul>"},{"location":"database/","title":"Databases","text":""},{"location":"file/","title":"File","text":"<p> Apache Avro Apache Hudi Apache Iceberg Apache ORC Apache Parquet CSV Delta Lake</p> Attribute Apache Avro Apache Hudi Apache Iceberg Apache ORC Apache Parquet CSV Delta Lake Name Apache Avro Apache Hudi Apache Iceberg Apache ORC Apache Parquet CSV Delta Lake Description Apache Avro is the leading serialization format for record data, and first choice for streaming data pipelines. Apache Hudi is a transactional data lake platform that brings database and data warehouse capabilities to the data lake. Utilises data stored in either parquet or orc. Iceberg is a high-performance format for huge analytic tables. Utilises data stored in either parquet, avro, or orc. ORC is a self-describing type-aware columnar file format designed for Hadoop workloads. Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. Comma-Separated Values (CSV) is a text file format that uses commas to separate values in plain text. Delta Lake is an open-source storage framework that enables building a Lakehouse architecture. License Apache license 2.0 Apache license 2.0 Apache license 2.0 Apache license 2.0 Apache license 2.0 N/A Apache license 2.0 Source code https://github.com/apache/avro https://github.com/apache/hudi https://github.com/apache/iceberg https://github.com/apache/orc https://github.com/apache/parquet-format https://github.com/delta-io/delta Website https://avro.apache.org/ https://hudi.apache.org/ https://iceberg.apache.org/ https://orc.apache.org/ https://parquet.apache.org/ https://www.rfc-editor.org/rfc/rfc4180.html https://delta.io/ Year created 2009 2016 2017 2013 2013 0 2019 Company Uber Netflix Hortonworks, Facebook Twitter, Cloudera Databricks Language support java, c++, c#, c, python, javascript, perl, ruby, php, rust java, scala, c++, python java, scala, c++, python, r, php java, scala, c++, python, r, php, go scala, java, python, rust Use cases Stream processing, Analytics, Efficient data exchange Incremental data processing, Data upserts, Change Data Capture (CDC), ACID transactions Write once read many, Analytics, Efficient storage, ACID transactions Write once read many, Analytics, Efficient storage, ACID transactions Write once read many, Analytics, Efficient storage, Column based queries Write once read many, Analytics, Efficient storage, ACID transactions Is human readable Orientation row column or row column or row row column row column Has type system Has nested structure support Has native compression Has encoding support Has constraint support Has acid support Has metadata Has encryption support Data processing framework support Apache Flink, Apache Gobblin, Apache NiFi, Apache Pig, Apache Spark, Apache Spark, Apache Flink, Apache Drill, Apache Flink, Apache Gobblin, Apache Hadoop, Apache NiFi, Apache Pig, Apache Spark, Apache Beam, Apache Drill, Apache Flink, Apache Spark, Apache Beam, Apache Drill, Apache Flink, Apache Gobblin, Apache Hive, Apache NiFi, Apache 
Pig, Apache Spark, Apache Drill, Analytics query support Apache Impala, Apache Druid, Apache Hive, Apache Pinot, AWS Athena, BigQuery, Clickhouse, Firebolt, Apache Hive, Apache Impala, AWS Athena, BigQuery, Clickhouse, Presto, Trino, Apache Impala, Apache Druid, AWS Athena, BigQuery, Clickhouse, Dremio, DuckDB, Presto, Trino, Apache Impala, Apache Druid, Apache Hive, Apache Pinot, AWS Athena, BigQuery, Clickhouse, Firebolt, Presto, Trino, Apache Hive, Apache Impala, Apache Druid, Apache Pinot, AWS Athena, Azure Synapse, BigQuery, Clickhouse, Dremio, DuckDB, Firebolt, Apache Impala, Apache Druid, Apache Pinot, AWS Athena, Azure Synapse, BigQuery, Clickhouse, Dremio, DuckDB, Firebolt, AWS Athena, Azure Synapse, BigQuery, Clickhouse, Dremio, Presto, Trino,"}]}
{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"],"fields":{"title":{"boost":1000.0},"text":{"boost":1.0},"tags":{"boost":1000000.0}}},"docs":[{"location":"","title":"Data Tech Compare","text":"<p>Compare all data related technologies with each other to find the best fit for you and your use case.</p>"},{"location":"#categories","title":"Categories","text":"<ul> <li> <p> Files</p> <p>CSV, Parquet, ORC, JSON, Avro, etc.</p> </li> </ul>"},{"location":"database/","title":"Databases","text":""},{"location":"file/","title":"File","text":"<p> Apache Avro Apache Hudi Apache Iceberg Apache ORC Apache Parquet CSV Delta Lake</p> Attribute Apache Avro Apache Hudi Apache Iceberg Apache ORC Apache Parquet CSV Delta Lake Name Apache Avro Apache Hudi Apache Iceberg Apache ORC Apache Parquet CSV Delta Lake Description Apache Avro is the leading serialization format for record data, and first choice for streaming data pipelines. Apache Hudi is a transactional data lake platform that brings database and data warehouse capabilities to the data lake. Utilises data stored in either parquet or orc. Iceberg is a high-performance format for huge analytic tables. Utilises data stored in either parquet, avro, or orc. ORC is a self-describing type-aware columnar file format designed for Hadoop workloads. Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. Comma-Separated Values (CSV) is a text file format that uses commas to separate values in plain text. Delta Lake is an open-source storage framework that enables building a Lakehouse architecture. License Apache license 2.0 Apache license 2.0 Apache license 2.0 Apache license 2.0 Apache license 2.0 N/A Apache license 2.0 Source code https://github.com/apache/avro https://github.com/apache/hudi https://github.com/apache/iceberg https://github.com/apache/orc https://github.com/apache/parquet-format https://github.com/delta-io/delta Website https://avro.apache.org/ https://hudi.apache.org/ https://iceberg.apache.org/ https://orc.apache.org/ https://parquet.apache.org/ https://www.rfc-editor.org/rfc/rfc4180.html https://delta.io/ Year created 2009 2016 2017 2013 2013 0 2019 Company Apache Uber Netflix Hortonworks, Facebook Twitter, Cloudera Databricks Language support java, c++, c#, c, python, javascript, perl, ruby, php, rust java, scala, c++, python java, scala, c++, python, r, php java, scala, c++, python, r, php, go scala, java, python, rust Use cases Stream processing, Analytics, Efficient data exchange Incremental data processing, Data upserts, Change Data Capture (CDC), ACID transactions Write once read many, Analytics, Efficient storage, ACID transactions Write once read many, Analytics, Efficient storage, ACID transactions Write once read many, Analytics, Efficient storage, Column based queries Write once read many, Analytics, Efficient storage, ACID transactions Is human readable no no no no no yes no Orientation row column or row column or row row column row column Has type system yes yes yes yes yes no yes Has nested structure support yes yes yes yes yes no yes Has native compression yes yes yes yes yes no yes Has encoding support yes yes yes yes yes no yes Has constraint support no yes no no no no yes Has acid support no yes yes no no no yes Has metadata yes yes yes yes yes no yes Has encryption support no maybe maybe yes yes no maybe Data processing framework support Apache Flink, Apache Gobblin, Apache NiFi, Apache Pig, Apache Spark, Apache Spark, Apache Flink, 
Apache Drill, Apache Flink, Apache Gobblin, Apache Pig, Apache Spark, Apache Flink, Apache Gobblin, Apache Hadoop, Apache NiFi, Apache Pig, Apache Spark, Apache Beam, Apache Drill, Apache Flink, Apache Spark, Apache Beam, Apache Drill, Apache Flink, Apache Gobblin, Apache Hive, Apache NiFi, Apache Pig, Apache Spark, Apache Drill, Apache Flink, Apache Spark, Analytics query support Apache Impala, Apache Druid, Apache Hive, Apache Pinot, AWS Athena, BigQuery, Clickhouse, Firebolt, Apache Hive, Apache Impala, AWS Athena, BigQuery, Clickhouse, Presto, Trino, Apache Impala, Apache Druid, Apache Hive, AWS Athena, BigQuery, Clickhouse, Dremio, DuckDB, Presto, Trino, Apache Impala, Apache Druid, Apache Hive, Apache Pinot, AWS Athena, BigQuery, Clickhouse, Firebolt, Presto, Trino, Apache Hive, Apache Impala, Apache Druid, Apache Pinot, AWS Athena, Azure Synapse, BigQuery, Clickhouse, Dremio, DuckDB, Firebolt, Apache Impala, Apache Druid, Apache Pinot, AWS Athena, Azure Synapse, BigQuery, Clickhouse, Dremio, DuckDB, Firebolt, Apache Hive, AWS Athena, Azure Synapse, BigQuery, Clickhouse, Dremio, Presto, Trino,"}]}
6 changes: 3 additions & 3 deletions site/sitemap.xml
@@ -2,17 +2,17 @@
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://tech-diff.com/</loc>
- <lastmod>2023-11-12</lastmod>
+ <lastmod>2023-11-14</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://tech-diff.com/database/</loc>
- <lastmod>2023-11-12</lastmod>
+ <lastmod>2023-11-14</lastmod>
<changefreq>daily</changefreq>
</url>
<url>
<loc>https://tech-diff.com/file/</loc>
- <lastmod>2023-11-12</lastmod>
+ <lastmod>2023-11-14</lastmod>
<changefreq>daily</changefreq>
</url>
</urlset>
Binary file modified site/sitemap.xml.gz
Binary file not shown.
3 changes: 2 additions & 1 deletion tech/file/apache-avro.yaml
@@ -5,7 +5,8 @@ license: Apache license 2.0
source_code: https://github.com/apache/avro
website: https://avro.apache.org/
year_created: 2009
- company: []
+ company:
+ - Apache
language_support:
- java
- c++
10 changes: 10 additions & 0 deletions tech/file/apache-iceberg.yaml
@@ -46,11 +46,21 @@ file:
  data_processing_framework_support:
    - value: Apache Drill
      source: https://drill.apache.org/docs/iceberg-format-plugin/
+   - value: Apache Flink
+     source: https://iceberg.apache.org/contribute/
+   - value: Apache Gobblin
+     source: https://github.com/apache/gobblin/tree/master/gobblin-iceberg
+   - value: Apache Pig
+     source: https://iceberg.apache.org/contribute/
+   - value: Apache Spark
+     source: https://iceberg.apache.org/spark-quickstart/
  analytics_query_support:
    - value: Apache Impala
      source: https://impala.apache.org/docs/build/html/topics/impala_iceberg.html
    - value: Apache Druid
      source: https://github.com/apache/druid/blob/3a3d37ef40596b6540b6d30ac82a20766335627b/docs/development/extensions-contrib/iceberg.md#L2
+   - value: Apache Hive
+     source: https://iceberg.apache.org/contribute/
    - value: AWS Athena
      source: https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html
    - value: BigQuery
13 changes: 13 additions & 0 deletions tech/file/delta-lake.yaml
@@ -20,16 +20,22 @@ use_cases:
file:
  is_human_readable:
    value: "no"
+   source: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#file-types
  orientation:
    value: column
+   source: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#file-types
  has_type_system:
    value: "yes"
+   source: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format
  has_nested_structure_support:
    value: "yes"
+   source: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format
  has_native_compression:
    value: "yes"
+   source: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#file-types
  has_encoding_support:
    value: "yes"
+   source: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#file-types
  has_constraint_support:
    value: "yes"
    source: https://docs.delta.io/latest/delta-constraints.html
@@ -38,14 +38,21 @@ file:
    source: https://docs.delta.io/latest/concurrency-control.html
  has_metadata:
    value: "yes"
+   source: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#actions
  has_encryption_support:
    value: "maybe"
    source: https://docs.delta.io/latest/delta-faq.html#what-format-does-delta-lake-use-to-store-data
    notes: "Implicitly supported by data stored as Parquet but no direct API to support encryption"
  data_processing_framework_support:
    - value: Apache Drill
      source: https://drill.apache.org/docs/delta-lake-format-plugin/
+   - value: Apache Flink
+     source: https://github.com/delta-io/delta/tree/master/connectors/flink/
+   - value: Apache Spark
+     source: https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake
  analytics_query_support:
+   - value: Apache Hive
+     source: https://github.com/delta-io/delta/tree/master/connectors/hive
    - value: AWS Athena
      source: https://docs.aws.amazon.com/athena/latest/ug/delta-lake-tables.html
    - value: Azure Synapse

0 comments on commit c0f24b8
