Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in project iceberg by apache.
From the class Spark3Util, method getPartitions.
/**
 * Use Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the table's root path
 * @param format format of the data files
 * @param partitionFilter a map of partition column names to values, used to filter the listing
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(
    SparkSession spark, Path rootPath, String format, Map<String, String> partitionFilter) {
  FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);

  InMemoryFileIndex fileIndex = new InMemoryFileIndex(
      spark,
      JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
      scala.collection.immutable.Map$.MODULE$.<String, String>empty(),
      Option.empty(),
      fileStatusCache,
      Option.empty(),
      Option.empty());

  org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
  StructType schema = spec.partitionColumns();
  if (schema.isEmpty()) {
    return Lists.newArrayList();
  }

  // Convert the partition filter map into Catalyst expressions so the file index
  // can prune partition directories while listing.
  List<org.apache.spark.sql.catalyst.expressions.Expression> filterExpressions =
      SparkUtil.partitionMapToExpression(schema, partitionFilter);
  Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaPartitionFilters =
      JavaConverters.asScalaBufferConverter(filterExpressions).asScala().toIndexedSeq();

  // No data filters: only partition pruning is applied here.
  List<org.apache.spark.sql.catalyst.expressions.Expression> dataFilters = Lists.newArrayList();
  Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaDataFilters =
      JavaConverters.asScalaBufferConverter(dataFilters).asScala().toIndexedSeq();

  Seq<PartitionDirectory> filteredPartitions =
      fileIndex.listFiles(scalaPartitionFilters, scalaDataFilters).toIndexedSeq();

  return JavaConverters.seqAsJavaListConverter(filteredPartitions).asJava().stream()
      .map(partition -> {
        // Convert each Catalyst partition value back to a Java string keyed by column name.
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
          int fieldIndex = schema.fieldIndex(field.name());
          Object catalystValue = partition.values().get(fieldIndex, field.dataType());
          Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
          values.put(field.name(), String.valueOf(value));
        });

        // Use the parent directory of the first listed file as the partition location.
        FileStatus fileStatus =
            JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0);
        return new SparkPartition(values, fileStatus.getPath().getParent().toString(), format);
      })
      .collect(Collectors.toList());
}
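A minimal usage sketch of the method above. The HDFS path and the region filter key are illustrative placeholders, and the snippet assumes SparkPartition exposes getUri() and getValues() accessors:

import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class ListPartitionsExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("list-partitions")
        .getOrCreate();

    // Hypothetical Parquet table root and partition filter; adjust to your environment.
    Path rootPath = new Path("hdfs://namenode/warehouse/db/events");
    Map<String, String> filter = Collections.singletonMap("region", "us-east-1");

    List<SparkPartition> partitions =
        Spark3Util.getPartitions(spark, rootPath, "parquet", filter);

    for (SparkPartition partition : partitions) {
      System.out.println(partition.getUri() + " -> " + partition.getValues());
    }
  }
}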
Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in project iceberg by apache.
From the class AddFilesProcedure, method importFileTable.
private void importFileTable(
    Table table, Path tableLocation, String format, Map<String, String> partitionFilter,
    boolean checkDuplicateFiles) {
  // List partitions via Spark's in-memory file index
  List<SparkPartition> partitions =
      Spark3Util.getPartitions(spark(), tableLocation, format, partitionFilter);

  if (table.spec().isUnpartitioned()) {
    Preconditions.checkArgument(partitions.isEmpty(),
        "Cannot add partitioned files to an unpartitioned table");
    Preconditions.checkArgument(partitionFilter.isEmpty(),
        "Cannot use a partition filter when importing to an unpartitioned table");

    // Build a global partition spanning the whole source location
    SparkPartition partition =
        new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format);
    importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles);
  } else {
    Preconditions.checkArgument(!partitions.isEmpty(),
        "Cannot find any matching partitions in table %s", table.name());
    importPartitions(table, partitions, checkDuplicateFiles);
  }
}
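In practice this method is reached through Iceberg's add_files stored procedure rather than called directly. A hedged sketch of that invocation from a Spark session follows; the catalog, namespace, source path, and filter values are placeholders for your environment:

// Sketch: calling the add_files procedure, which routes file-based sources
// (such as a raw Parquet directory) through importFileTable.
spark.sql(
    "CALL my_catalog.system.add_files("
        + "table => 'db.events_iceberg', "
        + "source_table => '`parquet`.`hdfs://namenode/warehouse/db/events`', "
        + "partition_filter => map('region', 'us-east-1'))");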