Skip to content

Commit

Permalink
[TINKERPOP-3133] Allow customize the output partition
Browse files Browse the repository at this point in the history
Set the repartition number with, for example, 'gremlin.spark.outputRepartition=500'.
Only integer values larger than 0 are valid input; otherwise the
repartition will be skipped silently.
  • Loading branch information
Hongjiang Zhang committed Feb 11, 2025
1 parent bdba436 commit e3ff08f
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ private Constants() {
public static final String GREMLIN_SPARK_SKIP_PARTITIONER = "gremlin.spark.skipPartitioner"; // don't partition the loadedGraphRDD
public static final String GREMLIN_SPARK_SKIP_GRAPH_CACHE = "gremlin.spark.skipGraphCache"; // don't cache the loadedGraphRDD (ignores graphStorageLevel)
public static final String GREMLIN_SPARK_DONT_DELETE_NON_EMPTY_OUTPUT = "gremlin.spark.dontDeleteNonEmptyOutput"; // don't delete the output if it is not empty
// only integer values > 0 take effect; other values cause the repartition to be skipped
public static final String GREMLIN_SPARK_OUTPUT_REPARTITION = "gremlin.spark.outputRepartition"; // allows setting the partition count of the outputRDD to reduce HDFS small files
public static final String SPARK_SERIALIZER = "spark.serializer";
public static final String SPARK_KRYO_REGISTRATOR = "spark.kryo.registrator";
public static final String SPARK_KRYO_REGISTRATION_REQUIRED = "spark.kryo.registrationRequired";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<O
final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
if (null != outputLocation) {
// map back to a <nullwritable,vertexwritable> stream for output
graphRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
JavaPairRDD<Object, VertexWritable> javaPairRDD = repartitionJavaPairRDD(hadoopConfiguration, graphRDD);
javaPairRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
.saveAsNewAPIHadoopFile(Constants.getGraphLocation(outputLocation),
NullWritable.class,
VertexWritable.class,
Expand All @@ -62,7 +63,8 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
if (null != outputLocation) {
// map back to a Hadoop stream for output
memoryRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
JavaPairRDD<K, V> javaPairRDD = repartitionJavaPairRDD(hadoopConfiguration, memoryRDD);
javaPairRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
.saveAsNewAPIHadoopFile(Constants.getMemoryLocation(outputLocation, memoryKey),
ObjectWritable.class,
ObjectWritable.class,
Expand All @@ -75,4 +77,17 @@ public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration config
}
return Collections.emptyIterator();
}
}

/**
 * Repartitions the given RDD when the user has configured
 * {@code Constants.GREMLIN_SPARK_OUTPUT_REPARTITION}, allowing control over the
 * number of output partitions (e.g. to reduce HDFS small files).
 * <p>
 * Only integer values larger than 0 trigger a repartition. A missing, zero,
 * negative, or non-numeric value leaves the RDD untouched — the repartition is
 * skipped silently, as documented for this setting.
 *
 * @param hadoopConfiguration the Hadoop configuration that may carry the repartition setting
 * @param graphRDD            the RDD to potentially repartition
 * @param <K>                 the RDD key type
 * @param <V>                 the RDD value type
 * @return the repartitioned RDD when a valid setting is present, otherwise the original RDD
 */
private <K, V> JavaPairRDD<K, V> repartitionJavaPairRDD(final org.apache.hadoop.conf.Configuration hadoopConfiguration, final JavaPairRDD<K, V> graphRDD) {
    final String repartitionString = hadoopConfiguration.get(Constants.GREMLIN_SPARK_OUTPUT_REPARTITION);
    if (null != repartitionString) {
        try {
            final int repartition = Integer.parseInt(repartitionString);
            if (repartition > 0) {
                return graphRDD.repartition(repartition);
            }
        } catch (final NumberFormatException ignored) {
            // a malformed value is treated like an absent one: skip the repartition
            // silently instead of failing the whole output job
        }
    }
    return graphRDD;
}
}

0 comments on commit e3ff08f

Please sign in to comment.