[TINKERPOP-3133] Allow customizing the output partition
Set the repartition number via, for example, 'gremlin.spark.outputRepartition=500'.
Only integer values larger than 0 are valid input; otherwise the
repartition is skipped silently.
Hongjiang Zhang committed Feb 12, 2025 · 1 parent bdba436 · commit 41edcc0
Showing 3 changed files with 38 additions and 7 deletions.
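
For context, the new option is an ordinary gremlin.spark.* property on the job configuration. A minimal sketch of enabling it from Java (the surrounding graph settings are illustrative assumptions, not part of this commit):

import org.apache.commons.configuration2.BaseConfiguration;
import org.apache.commons.configuration2.Configuration;

public class OutputRepartitionConfig {
    public static void main(final String[] args) {
        final Configuration conf = new BaseConfiguration();
        // illustrative job settings; any SparkGraphComputer job applies
        conf.setProperty("gremlin.graph", "org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph");
        conf.setProperty("gremlin.hadoop.outputLocation", "output");
        // the new option: write the output RDD as 500 partitions, i.e. roughly
        // 500 files under the output location instead of one file per task
        conf.setProperty("gremlin.spark.outputRepartition", "500");
    }
}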
@@ -74,6 +74,7 @@ private Constants() {
     public static final String GREMLIN_SPARK_SKIP_PARTITIONER = "gremlin.spark.skipPartitioner"; // don't partition the loadedGraphRDD
     public static final String GREMLIN_SPARK_SKIP_GRAPH_CACHE = "gremlin.spark.skipGraphCache"; // don't cache the loadedGraphRDD (ignores graphStorageLevel)
     public static final String GREMLIN_SPARK_DONT_DELETE_NON_EMPTY_OUTPUT = "gremlin.spark.dontDeleteNonEmptyOutput"; // don't delete the output if it is not empty
+    public static final String GREMLIN_SPARK_OUTPUT_REPARTITION = "gremlin.spark.outputRepartition"; // allow setting the repartition number of the outputRDD to reduce HDFS small files
     public static final String SPARK_SERIALIZER = "spark.serializer";
     public static final String SPARK_KRYO_REGISTRATOR = "spark.kryo.registrator";
     public static final String SPARK_KRYO_REGISTRATION_REQUIRED = "spark.kryo.registrationRequired";

@@ -48,7 +48,8 @@ public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<O
         final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
         if (null != outputLocation) {
             // map back to a <nullwritable,vertexwritable> stream for output
-            graphRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
+            JavaPairRDD<Object, VertexWritable> javaPairRDD = repartitionJavaPairRDD(hadoopConfiguration, graphRDD);
+            javaPairRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
                     .saveAsNewAPIHadoopFile(Constants.getGraphLocation(outputLocation),
                             NullWritable.class,
                             VertexWritable.class,
@@ -62,7 +63,8 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
         final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
         if (null != outputLocation) {
             // map back to a Hadoop stream for output
-            memoryRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
+            JavaPairRDD<K, V> javaPairRDD = repartitionJavaPairRDD(hadoopConfiguration, memoryRDD);
+            javaPairRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
                     .saveAsNewAPIHadoopFile(Constants.getMemoryLocation(outputLocation, memoryKey),
                             ObjectWritable.class,
                             ObjectWritable.class,
@@ -75,4 +77,17 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
             }
             return Collections.emptyIterator();
         }
-}
+
+    /**
+     * Allow users to customize the RDD partitions to reduce HDFS small files
+     */
+    private static <K, V> JavaPairRDD<K, V> repartitionJavaPairRDD(final org.apache.hadoop.conf.Configuration hadoopConfiguration, JavaPairRDD<K, V> graphRDD) {
+        JavaPairRDD<K, V> javaPairRDD = graphRDD;
+        final String repartitionString = hadoopConfiguration.get(Constants.GREMLIN_SPARK_OUTPUT_REPARTITION);
+        final int repartition = null == repartitionString ? -1 : Integer.parseInt(repartitionString);
+        if (repartition > 0) {
+            javaPairRDD = javaPairRDD.repartition(repartition);
+        }
+        return javaPairRDD;
+    }
+}
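
One caveat worth noting: only the unset and non-positive cases are skipped silently; a non-numeric value reaches Integer.parseInt and throws NumberFormatException. A standalone sketch of the same guard logic (class and method names here are hypothetical, not from the patch):

public final class RepartitionGuard {

    // mirrors the helper above: null -> disabled, <= 0 -> disabled,
    // non-numeric -> NumberFormatException (not silent)
    static int parseRepartition(final String value) {
        final int repartition = null == value ? -1 : Integer.parseInt(value);
        return repartition > 0 ? repartition : -1;
    }

    public static void main(final String[] args) {
        System.out.println(parseRepartition(null));   // -1, repartition skipped
        System.out.println(parseRepartition("0"));    // -1, repartition skipped
        System.out.println(parseRepartition("500"));  // 500, repartition applied
    }
}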
@@ -51,15 +51,16 @@ public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<O
         SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)); // this might be bad cause it unpersists the job RDD
         // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
         final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
+        final JavaPairRDD<Object, VertexWritable> javaPairRDD = repartitionJavaPairRDD(configuration, graphRDD);
         if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true))
-            graphRDD.mapValues(vertex -> {
+            javaPairRDD.mapValues(vertex -> {
                 vertex.get().dropEdges(Direction.BOTH);
                 return vertex;
             }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                     // call action to eager store rdd
                     .count();
         else
-            graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
+            javaPairRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel)
                     // call action to eager store rdd
                     .count();
         Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
@@ -73,15 +74,29 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
             throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
         final String memoryRDDName = Constants.getMemoryLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), memoryKey);
         Spark.removeRDD(memoryRDDName);
-        memoryRDD.setName(memoryRDDName).persist(StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY")))
+        final JavaPairRDD<K, V> javaPairRDD = repartitionJavaPairRDD(configuration, memoryRDD);
+        javaPairRDD.setName(memoryRDDName).persist(StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY")))
                 // call action to eager store rdd
                 .count();
         Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
-        return IteratorUtils.map(memoryRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2()));
+        return IteratorUtils.map(javaPairRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2()));
     }

     @Override
     public boolean supportsResultGraphPersistCombination(final GraphComputer.ResultGraph resultGraph, final GraphComputer.Persist persist) {
         return persist.equals(GraphComputer.Persist.NOTHING) || resultGraph.equals(GraphComputer.ResultGraph.NEW);
     }
+
+    /**
+     * Allow users to customize the RDD partitions to reduce HDFS small files
+     */
+    private static <K, V> JavaPairRDD<K, V> repartitionJavaPairRDD(final Configuration configuration, JavaPairRDD<K, V> graphRDD) {
+        JavaPairRDD<K, V> javaPairRDD = graphRDD;
+        final String repartitionString = configuration.getString(Constants.GREMLIN_SPARK_OUTPUT_REPARTITION);
+        final int repartition = null == repartitionString ? -1 : Integer.parseInt(repartitionString);
+        if (repartition > 0) {
+            javaPairRDD = javaPairRDD.repartition(repartition);
+        }
+        return javaPairRDD;
+    }
 }
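
The helper is duplicated in both writers because each Spark output partition becomes one file on HDFS, so shrinking the partition count just before the save or persist is what cuts down the small files. A self-contained sketch of that Spark-level effect (local-mode setup and names are assumptions, not from this commit):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class RepartitionDemo {
    public static void main(final String[] args) {
        final SparkConf sparkConf = new SparkConf().setMaster("local[4]").setAppName("repartition-demo");
        try (final JavaSparkContext sc = new JavaSparkContext(sparkConf)) {
            // 200 partitions would normally produce 200 output files
            final JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(
                    Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "b")), 200);
            System.out.println(rdd.getNumPartitions());                 // 200
            // after repartition(8), a save would emit only 8 files
            System.out.println(rdd.repartition(8).getNumPartitions());  // 8
        }
    }
}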
