
Commit 04b0d09

fietenoer, lorenzkautzsch, and Aditya Pandey authored and committed
[SYSTEMDS-3650] New I/O Support Cloud-optimized GeoTIFF
DIA WiSe 24/25 project

Closes #2195.

Co-authored-by: lorenzkautzsch <[email protected]>
Co-authored-by: Aditya Pandey <[email protected]>
1 parent c745550 commit 04b0d09


50 files changed (+2398 / -11 lines)

docs/site/dml-language-reference.md

Lines changed: 12 additions & 1 deletion
@@ -868,13 +868,15 @@ that users provide MTD files for their own data as well.
 
 #### File formats and MTD files
 
-SystemDS supports 4 file formats:
+SystemDS supports 6 file formats:
 
 * CSV (delimited)
 * Matrix Market (coordinate)
 * Text (i,j,v)
+* LIBSVM
 * Binary
 * HDF5
+* COG
 
 The CSV format is a standard text-based format where columns are separated by delimiter characters, typically commas, and
 rows are represented on separate lines.
@@ -899,6 +901,15 @@ Hierarchical Data Format (HDF) is a file format designed to store and organize l
 some features of the HDF5 like two dimension data (Matrix), matrix with FP64 (double) data type,
 single dataset, single group, and contiguous dataset.
 
+Cloud Optimized GeoTIFF (COG) is an image format designed to store large amounts of geospatial data while allowing for
+efficient access. This is done by splitting the image into tiles which can then be accessed independently. Currently, SystemDS
+only supports reading COG files and can only process the most important metadata that is required for reading the image. Normal
+TIFF files that aren't tiled cannot be read by SystemDS. Support for BigTIFF is very limited and not recommended.
+
+The currently supported compression methods are as follows:
+- None
+- Deflate
+
 Let's look at a matrix and examples of its data represented in the supported formats with corresponding metadata. In the table below, we have
 a matrix consisting of 4 rows and 3 columns.

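For orientation, the following is a minimal sketch (not part of the commit) of reading a tiled COG file from Java through the MLContext API. The format string "cog", the file path, and the local Spark setup are assumptions for illustration; since the parser no longer requires dimension metadata for COG reads (see the DataExpression change below), the script does not pass rows/cols.

```java
import org.apache.spark.sql.SparkSession;
import org.apache.sysds.api.mlcontext.MLContext;
import org.apache.sysds.api.mlcontext.Script;
import static org.apache.sysds.api.mlcontext.ScriptFactory.dml;

public class ReadCogExample {
	public static void main(String[] args) {
		// local Spark session, only needed to drive MLContext in this sketch
		SparkSession spark = SparkSession.builder()
			.appName("cog-read-sketch").master("local[*]").getOrCreate();
		MLContext ml = new MLContext(spark);

		// hypothetical path; format string "cog" assumed from the new FileFormat.COG value
		Script s = dml("A = read(\"/tmp/ortho_tiled.tif\", format=\"cog\");\n"
			+ "print(\"dims: \" + nrow(A) + \" x \" + ncol(A));");
		ml.execute(s);
		spark.stop();
	}
}
```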
src/main/java/org/apache/sysds/common/Types.java

Lines changed: 1 addition & 0 deletions
@@ -863,6 +863,7 @@ public enum FileFormat {
 	FEDERATED, // A federated matrix
 	PROTO, // protocol buffer representation
 	HDF5, // Hierarchical Data Format (HDF)
+	COG, // Cloud-optimized GeoTIFF
 	UNKNOWN;
 
 	public boolean isIJV() {
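The new enum constant is the switch point for the format: the parser compares the format string from a read() against it case-insensitively (see the DataExpression change below), and the reader factory dispatches on it. A tiny sketch, assuming the script-level format string is "cog":

```java
import org.apache.sysds.common.Types.FileFormat;

public class CogFormatLookup {
	public static void main(String[] args) {
		String formatTypeString = "cog"; // hypothetical value of a read() format parameter
		// mirrors the case-insensitive check added in DataExpression
		boolean isCOG = formatTypeString.equalsIgnoreCase(FileFormat.COG.toString());
		System.out.println("isCOG = " + isCOG); // true, regardless of the enum's toString() casing
	}
}
```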

src/main/java/org/apache/sysds/lops/compile/Dag.java

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
-import org.apache.sysds.common.Opcodes;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.common.Types.ExecType;
 import org.apache.sysds.common.Types.FileFormat;

src/main/java/org/apache/sysds/parser/DMLTranslator.java

Lines changed: 0 additions & 1 deletion
@@ -32,7 +32,6 @@
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.common.Builtins;
-import org.apache.sysds.common.Opcodes;
 import org.apache.sysds.common.Types.AggOp;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.common.Types.Direction;

src/main/java/org/apache/sysds/parser/DataExpression.java

Lines changed: 3 additions & 1 deletion
@@ -1164,6 +1164,8 @@ else if( getVarParam(READNNZPARAM) != null ) {
 
 		boolean isHDF5 = (formatTypeString != null && formatTypeString.equalsIgnoreCase(FileFormat.HDF5.toString()));
 
+		boolean isCOG = (formatTypeString != null && formatTypeString.equalsIgnoreCase(FileFormat.COG.toString()));
+
 		dataTypeString = (getVarParam(DATATYPEPARAM) == null) ? null : getVarParam(DATATYPEPARAM).toString();
 
 		if ( dataTypeString == null || dataTypeString.equalsIgnoreCase(Statement.MATRIX_DATA_TYPE)
@@ -1188,7 +1190,7 @@ else if( getVarParam(READNNZPARAM) != null ) {
 		// initialize size of target data identifier to UNKNOWN
 		getOutput().setDimensions(-1, -1);
 
-		if (!isCSV && !isLIBSVM && !isHDF5 && ConfigurationManager.getCompilerConfig()
+		if (!isCSV && !isLIBSVM && !isHDF5 && !isCOG && ConfigurationManager.getCompilerConfig()
 			.getBool(ConfigType.REJECT_READ_WRITE_UNKNOWNS) //skip check for csv/libsvm format / jmlc api
 			&& (getVarParam(READROWPARAM) == null || getVarParam(READCOLPARAM) == null) ) {
 			raiseValidateError("Missing or incomplete dimension information in read statement: "

src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/ACachingMBDictionary.java

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@
 
 public abstract class ACachingMBDictionary extends ADictionary {
 
+	private static final long serialVersionUID = 7035552219254994595L;
 	/** A Cache to contain a materialized version of the identity matrix. */
 	protected volatile SoftReference<MatrixBlockDictionary> cache = null;
 
src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/AIdentityDictionary.java

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@
 import org.apache.sysds.runtime.compress.DMLCompressionException;
 
 public abstract class AIdentityDictionary extends ACachingMBDictionary {
+	private static final long serialVersionUID = 5013713435287705877L;
 	/** The number of rows or columns, rows can be +1 if withEmpty is set. */
 	protected final int nRowCol;
 	/** Specify if the Identity matrix should contain an empty row in the end. */

src/main/java/org/apache/sysds/runtime/functionobjects/ParameterizedBuiltin.java

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@
 import org.apache.commons.math3.distribution.FDistribution;
 import org.apache.commons.math3.distribution.NormalDistribution;
 import org.apache.commons.math3.distribution.TDistribution;
-import org.apache.sysds.common.Opcodes;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.util.UtilFunctions;
 
src/main/java/org/apache/sysds/runtime/instructions/gpu/BuiltinBinaryGPUInstruction.java

Lines changed: 0 additions & 2 deletions
@@ -19,7 +19,6 @@
 
 package org.apache.sysds.runtime.instructions.gpu;
 
-import org.apache.sysds.common.Opcodes;
 import org.apache.sysds.common.Types.DataType;
 import org.apache.sysds.common.Types.ValueType;
 import org.apache.sysds.runtime.DMLRuntimeException;
@@ -77,5 +76,4 @@ else if (isMatrixScalar && (opcode.equals("min") || opcode.equals("max")))
 			throw new DMLRuntimeException(
 				"GPU : Unsupported GPU builtin operations on a matrix and a scalar:" + opcode);
 	}
-
 }

src/main/java/org/apache/sysds/runtime/instructions/gpu/MatrixBuiltinGPUInstruction.java

Lines changed: 0 additions & 1 deletion
@@ -21,7 +21,6 @@
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.sysds.common.Opcodes;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysds.runtime.controlprogram.context.ExecutionContext;
src/main/java/org/apache/sysds/runtime/io/FileFormatPropertiesCOG.java

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.runtime.io;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.Serializable;
+
+public class FileFormatPropertiesCOG extends FileFormatProperties implements Serializable {
+	protected static final Log LOG = LogFactory.getLog(FileFormatPropertiesCOG.class.getName());
+	private static final long serialVersionUID = 1038419221722594985L;
+
+	private String datasetName;
+
+	public FileFormatPropertiesCOG() {
+		this.datasetName = "systemdscog";
+	}
+
+	public FileFormatPropertiesCOG(String datasetName) {
+		this.datasetName = datasetName;
+	}
+
+	public String getDatasetName() {
+		return datasetName;
+	}
+
+	@Override public String toString() {
+		StringBuilder sb = new StringBuilder();
+		sb.append(" datasetName " + datasetName);
+		return sb.toString();
+	}
+}
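The class above only carries an optional dataset name for the COG readers. A short usage sketch, not part of the commit; the name "elevation_band1" is a made-up example, while the default "systemdscog" comes from the no-argument constructor shown in the diff:

```java
import org.apache.sysds.runtime.io.FileFormatPropertiesCOG;

public class CogPropertiesExample {
	public static void main(String[] args) {
		FileFormatPropertiesCOG defaults = new FileFormatPropertiesCOG();
		FileFormatPropertiesCOG named = new FileFormatPropertiesCOG("elevation_band1");
		System.out.println(defaults.getDatasetName()); // prints: systemdscog
		System.out.println(named);                     // prints:  datasetName elevation_band1
	}
}
```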

src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java

Lines changed: 15 additions & 2 deletions
@@ -64,8 +64,15 @@ public static MatrixReader createMatrixReader(FileFormat fmt) {
 			break;
 
 		case HDF5:
-			reader = (par & mcsr) ? new ReaderHDF5Parallel(
-				new FileFormatPropertiesHDF5()) : new ReaderHDF5(new FileFormatPropertiesHDF5());
+			reader = (par & mcsr) ?
+				new ReaderHDF5Parallel(new FileFormatPropertiesHDF5()) :
+				new ReaderHDF5(new FileFormatPropertiesHDF5());
+			break;
+
+		case COG:
+			reader = (par & mcsr) ?
+				new ReaderCOGParallel(new FileFormatPropertiesCOG()) :
+				new ReaderCOG(new FileFormatPropertiesCOG());
 			break;
 
 		case COMPRESSED:
@@ -124,6 +131,12 @@ public static MatrixReader createMatrixReader( ReadProperties props ) {
 				fileFormatPropertiesHDF5);
 			break;
 
+		case COG:
+			FileFormatPropertiesCOG fileFormatPropertiesCOG = props.formatProperties != null ? (FileFormatPropertiesCOG) props.formatProperties : new FileFormatPropertiesCOG();
+			reader = (par & mcsr) ?
+				new ReaderCOGParallel(fileFormatPropertiesCOG) : new ReaderCOG(fileFormatPropertiesCOG);
+			break;
+
 		case COMPRESSED:
 			reader = new ReaderCompressed();
 			break;
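A minimal sketch (not part of the commit) of obtaining the new COG reader through the factory; the file path, block size, and the -1 placeholders for unknown dimensions/non-zeros are assumptions for illustration:

```java
import org.apache.sysds.common.Types.FileFormat;
import org.apache.sysds.runtime.io.MatrixReader;
import org.apache.sysds.runtime.io.MatrixReaderFactory;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;

public class CogReaderFactoryExample {
	public static void main(String[] args) throws Exception {
		// returns ReaderCOGParallel or ReaderCOG depending on the parallel-read
		// configuration (the par & mcsr check in the switch above)
		MatrixReader reader = MatrixReaderFactory.createMatrixReader(FileFormat.COG);
		// hypothetical path; -1 marks unknown rows/cols/nnz, 1000 is a placeholder block size
		MatrixBlock mb = reader.readMatrixFromHDFS("/tmp/ortho_tiled.tif", -1, -1, 1000, -1);
		System.out.println(mb.getNumRows() + " x " + mb.getNumColumns());
	}
}
```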
