Use of org.apache.spark.sql.execution.datasources.InMemoryFileIndex in project Iceberg by Apache: the class Spark3Util, method getPartitions.
/**
 * Use Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the table's root path
 * @param format the format of the partition files
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
  FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
  Map<String, String> emptyMap = Collections.emptyMap();

  // List the table root recursively and let Spark infer the partition layout from the directory names.
  InMemoryFileIndex fileIndex = new InMemoryFileIndex(
      spark,
      JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
      JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()),
      Option.empty(),
      fileStatusCache,
      Option.empty(),
      Option.empty());

  org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
  StructType schema = spec.partitionColumns();

  return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream()
      .map(partition -> {
        // Convert each partition's Catalyst values into plain strings keyed by partition column name.
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
          int fieldIndex = schema.fieldIndex(field.name());
          Object catalystValue = partition.values().get(fieldIndex, field.dataType());
          Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
          values.put(field.name(), String.valueOf(value));
        });
        return new SparkPartition(values, partition.path().toString(), format);
      })
      .collect(Collectors.toList());
}
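A minimal usage sketch for this overload. The table location, partition layout, and Spark session setup below are assumptions for illustration, not part of the Iceberg source.

// Hypothetical example: list the partitions of a Parquet table rooted at /data/db/events,
// laid out as .../events/dt=2021-01-01/part-00000.parquet and so on (path and layout are assumed).
SparkSession spark = SparkSession.builder()
    .master("local[*]")
    .appName("list-partitions")
    .getOrCreate();

List<SparkPartition> partitions =
    Spark3Util.getPartitions(spark, new Path("/data/db/events"), "parquet");
System.out.println("Found " + partitions.size() + " partitions: " + partitions);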
Use of org.apache.spark.sql.execution.datasources.InMemoryFileIndex in project Iceberg by Apache: the class Spark3Util, method getPartitions (overload that accepts a partition filter).
/**
 * Use Spark to list all partitions in the table that match the given partition filter.
 *
 * @param spark a Spark session
 * @param rootPath the table's root path
 * @param format the format of the partition files
 * @param partitionFilter a map from partition column name to value, used to filter the listed partitions
 * @return the table's partitions that match the filter
 */
public static List<SparkPartition> getPartitions(
    SparkSession spark, Path rootPath, String format, Map<String, String> partitionFilter) {
  FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);

  InMemoryFileIndex fileIndex = new InMemoryFileIndex(
      spark,
      JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
      scala.collection.immutable.Map$.MODULE$.<String, String>empty(),
      Option.empty(),
      fileStatusCache,
      Option.empty(),
      Option.empty());

  org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
  StructType schema = spec.partitionColumns();
  if (schema.isEmpty()) {
    // The table has no inferred partition columns, so there is nothing to list.
    return Lists.newArrayList();
  }

  // Turn the column-name-to-value filter map into Catalyst expressions and convert them to a Scala Seq.
  List<org.apache.spark.sql.catalyst.expressions.Expression> filterExpressions =
      SparkUtil.partitionMapToExpression(schema, partitionFilter);
  Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaPartitionFilters =
      JavaConverters.asScalaBufferConverter(filterExpressions).asScala().toIndexedSeq();

  // No data (non-partition) filters are applied here.
  List<org.apache.spark.sql.catalyst.expressions.Expression> dataFilters = Lists.newArrayList();
  Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaDataFilters =
      JavaConverters.asScalaBufferConverter(dataFilters).asScala().toIndexedSeq();

  Seq<PartitionDirectory> filteredPartitions =
      fileIndex.listFiles(scalaPartitionFilters, scalaDataFilters).toIndexedSeq();

  return JavaConverters.seqAsJavaListConverter(filteredPartitions).asJava().stream()
      .map(partition -> {
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
          int fieldIndex = schema.fieldIndex(field.name());
          Object catalystValue = partition.values().get(fieldIndex, field.dataType());
          Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
          values.put(field.name(), String.valueOf(value));
        });
        // All files in a PartitionDirectory share a parent directory, which is the partition's location.
        FileStatus fileStatus = JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0);
        return new SparkPartition(values, fileStatus.getPath().getParent().toString(), format);
      })
      .collect(Collectors.toList());
}
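A usage sketch for the filtered overload, under the same hypothetical setup as above; the partition column name dt and its value are assumptions for illustration only.

// Hypothetical example: list only the partitions of /data/db/events where dt=2021-01-01.
Map<String, String> partitionFilter = ImmutableMap.of("dt", "2021-01-01");
List<SparkPartition> filtered =
    Spark3Util.getPartitions(spark, new Path("/data/db/events"), "parquet", partitionFilter);
System.out.println("Matching partitions: " + filtered.size());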