Use of org.apache.spark.sql.types.StructType$ in the Apache Iceberg project.
The class IcebergSourceBenchmark, method appendAsFile.
protected void appendAsFile(Dataset<Row> ds) {
  // ensure the schema is precise (including nullability)
  StructType sparkSchema = SparkSchemaUtil.convert(table.schema());
  spark.createDataFrame(ds.rdd(), sparkSchema)
      .coalesce(1)
      .write()
      .format("iceberg")
      .mode(SaveMode.Append)
      .save(table.location());
}
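SparkSchemaUtil.convert turns the Iceberg table schema into the equivalent Spark StructType, so the DataFrame written above carries the table's exact field types and nullability instead of whatever Spark inferred for ds. A minimal, self-contained sketch of just that conversion, using a made-up two-field schema:

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.StructType;

public class ConvertSchemaSketch {
  public static void main(String[] args) {
    // made-up Iceberg schema: one required and one optional field
    Schema icebergSchema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    // required fields convert to non-nullable Spark StructFields, optional ones to nullable
    StructType sparkSchema = SparkSchemaUtil.convert(icebergSchema);
    System.out.println(sparkSchema.treeString());
    // prints roughly:
    // root
    //  |-- id: long (nullable = false)
    //  |-- data: string (nullable = true)
  }
}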
Use of org.apache.spark.sql.types.StructType$ in the Apache Iceberg project.
The class BaseRewriteManifestsSparkAction, method writeManifestsForUnpartitionedTable.
private List<ManifestFile> writeManifestsForUnpartitionedTable(Dataset<Row> manifestEntryDF, int numManifests) {
  Broadcast<FileIO> io = sparkContext().broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we rely only on the target number of manifests for unpartitioned tables
  // as we should not worry about having too much metadata per partition
  long maxNumManifestEntries = Long.MAX_VALUE;

  return manifestEntryDF
      .repartition(numManifests)
      .mapPartitions(
          toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
          manifestEncoder)
      .collectAsList();
}
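The cast above works because Dataset.schema() returns a StructType, StructType.apply(name) looks up the StructField for that column, and for a nested struct column the field's dataType() is itself a StructType. A self-contained sketch with a made-up stand-in for the manifest entry schema:

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class NestedStructLookupSketch {
  public static void main(String[] args) {
    // made-up nested "data_file" struct plus a top-level "status" column
    StructType dataFileType = new StructType(new StructField[] {
        new StructField("file_path", DataTypes.StringType, false, Metadata.empty()),
        new StructField("record_count", DataTypes.LongType, false, Metadata.empty())
    });
    StructType entryType = new StructType(new StructField[] {
        new StructField("status", DataTypes.IntegerType, false, Metadata.empty()),
        new StructField("data_file", dataFileType, true, Metadata.empty())
    });

    // apply(name) returns the StructField; for a struct column its dataType() is a StructType
    StructType sparkType = (StructType) entryType.apply("data_file").dataType();
    System.out.println(sparkType.simpleString()); // struct<file_path:string,record_count:bigint>
  }
}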
Use of org.apache.spark.sql.types.StructType$ in the Apache Iceberg project.
The class BaseRewriteManifestsSparkAction, method writeManifestsForPartitionedTable.
private List<ManifestFile> writeManifestsForPartitionedTable(
    Dataset<Row> manifestEntryDF, int numManifests, int targetNumManifestEntries) {

  Broadcast<FileIO> io = sparkContext().broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough
  long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries);

  return withReusableDS(manifestEntryDF, df -> {
    Column partitionColumn = df.col("data_file.partition");
    return df
        .repartitionByRange(numManifests, partitionColumn)
        .sortWithinPartitions(partitionColumn)
        .mapPartitions(
            toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
            manifestEncoder)
        .collectAsList();
  });
}
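For the partitioned case, repartitionByRange plus sortWithinPartitions clusters manifest entries by their partition value, so each output task sees a contiguous partition range before the per-partition write. A hedged sketch of the same clustering pattern on a toy DataFrame (the column names and counts are made up):

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.lit;

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ClusterByPartitionSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[2]")
        .appName("cluster-by-partition-sketch")
        .getOrCreate();

    // made-up input: "bucket" stands in for data_file.partition
    Dataset<Row> df = spark.range(0, 1000).withColumn("bucket", col("id").mod(lit(10)));

    Column clusterColumn = df.col("bucket");

    // range-partition into a fixed number of output partitions, then sort inside each one,
    // so rows that share a cluster key end up adjacent
    Dataset<Row> clustered = df
        .repartitionByRange(4, clusterColumn)
        .sortWithinPartitions(clusterColumn);

    System.out.println(clustered.rdd().getNumPartitions()); // 4

    spark.stop();
  }
}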
Use of org.apache.spark.sql.types.StructType$ in the Apache Iceberg project.
The class BaseTableCreationSparkAction, method stageDestTable.
protected StagedSparkTable stageDestTable() {
  try {
    Map<String, String> props = destTableProps();
    StructType schema = sourceTable.schema();
    Transform[] partitioning = sourceTable.partitioning();
    return (StagedSparkTable) destCatalog().stageCreate(destTableIdent(), schema, partitioning, props);
  } catch (org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException e) {
    throw new NoSuchNamespaceException("Cannot create table %s as the namespace does not exist", destTableIdent());
  } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) {
    throw new AlreadyExistsException("Cannot create table %s as it already exists", destTableIdent());
  }
}
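stageCreate comes from Spark's StagingTableCatalog API: the table is prepared with the given schema, partitioning, and properties but only becomes visible when commitStagedChanges() is called on the returned StagedTable (abortStagedChanges() rolls it back). A sketch of that stage-then-commit pattern, built around a hypothetical helper and hypothetical catalog and identifier arguments:

import java.util.Map;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.StagedTable;
import org.apache.spark.sql.connector.catalog.StagingTableCatalog;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.types.StructType;

public class StageCreateSketch {
  // hypothetical helper: stage a table, do some work, then commit or abort atomically
  static void createAtomically(StagingTableCatalog catalog, Identifier ident,
      StructType schema, Transform[] partitioning, Map<String, String> props) throws Exception {
    StagedTable staged = catalog.stageCreate(ident, schema, partitioning, props);
    try {
      // ... populate or validate the staged table here ...
      staged.commitStagedChanges(); // the table becomes visible only now
    } catch (Exception e) {
      staged.abortStagedChanges(); // nothing is left behind on failure
      throw e;
    }
  }
}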
Use of org.apache.spark.sql.types.StructType$ in the Apache Iceberg project.
The class ParquetWithSparkSchemaVisitor, method visit.
public static <T> T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor<T> visitor) {
  Preconditions.checkArgument(sType != null, "Invalid DataType: null");
  if (type instanceof MessageType) {
    Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType);
    StructType struct = (StructType) sType;
    return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor));

  } else if (type.isPrimitive()) {
    return visitor.primitive(sType, type.asPrimitiveType());

  } else {
    // if not a primitive, the typeId must be a group
    GroupType group = type.asGroupType();
    OriginalType annotation = group.getOriginalType();
    if (annotation != null) {
      switch (annotation) {
        case LIST:
          Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED),
              "Invalid list: top-level group is repeated: %s", group);
          Preconditions.checkArgument(group.getFieldCount() == 1,
              "Invalid list: does not contain single repeated field: %s", group);

          GroupType repeatedElement = group.getFields().get(0).asGroupType();
          Preconditions.checkArgument(repeatedElement.isRepetition(Repetition.REPEATED),
              "Invalid list: inner group is not repeated");
          Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1,
              "Invalid list: repeated group is not a single field: %s", group);

          Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType);
          ArrayType array = (ArrayType) sType;
          StructField element = new StructField(
              "element", array.elementType(), array.containsNull(), Metadata.empty());

          visitor.fieldNames.push(repeatedElement.getName());
          try {
            T elementResult = null;
            if (repeatedElement.getFieldCount() > 0) {
              elementResult = visitField(element, repeatedElement.getType(0), visitor);
            }

            return visitor.list(array, group, elementResult);

          } finally {
            visitor.fieldNames.pop();
          }

        case MAP:
          Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED),
              "Invalid map: top-level group is repeated: %s", group);
          Preconditions.checkArgument(group.getFieldCount() == 1,
              "Invalid map: does not contain single repeated field: %s", group);

          GroupType repeatedKeyValue = group.getType(0).asGroupType();
          Preconditions.checkArgument(repeatedKeyValue.isRepetition(Repetition.REPEATED),
              "Invalid map: inner group is not repeated");
          Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2,
              "Invalid map: repeated group does not have 2 fields");

          Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType);
          MapType map = (MapType) sType;
          StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty());
          StructField valueField = new StructField(
              "value", map.valueType(), map.valueContainsNull(), Metadata.empty());

          visitor.fieldNames.push(repeatedKeyValue.getName());
          try {
            T keyResult = null;
            T valueResult = null;
            switch (repeatedKeyValue.getFieldCount()) {
              case 2:
                // if there are 2 fields, both key and value are projected
                keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor);
                valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor);
                break;
              case 1:
                // if there is just one, use the name to determine what it is
                Type keyOrValue = repeatedKeyValue.getType(0);
                if (keyOrValue.getName().equalsIgnoreCase("key")) {
                  keyResult = visitField(keyField, keyOrValue, visitor);
                  // value result remains null
                } else {
                  valueResult = visitField(valueField, keyOrValue, visitor);
                  // key result remains null
                }
                break;
              default:
            }

            return visitor.map(map, group, keyResult, valueResult);

          } finally {
            visitor.fieldNames.pop();
          }

        default:
      }
    }

    Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType);
    StructType struct = (StructType) sType;
    return visitor.struct(struct, group, visitFields(struct, group, visitor));
  }
}
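The visitor matches each annotated Parquet group against the corresponding Spark type: a LIST group must pair with an ArrayType and a MAP group with a MapType, and because Spark's collection types carry no field names, the visitor synthesizes the "element", "key", and "value" StructFields itself. A small sketch of those Spark-side constructions, with made-up element, key, and value types:

import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.MapType;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;

public class SparkCollectionTypesSketch {
  public static void main(String[] args) {
    // made-up array<string> with nullable elements, matching a Parquet LIST group
    ArrayType array = DataTypes.createArrayType(DataTypes.StringType, true);
    StructField element = new StructField(
        "element", array.elementType(), array.containsNull(), Metadata.empty());

    // made-up map<string, long>; map keys are never nullable, values may be
    MapType map = DataTypes.createMapType(DataTypes.StringType, DataTypes.LongType, true);
    StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty());
    StructField valueField = new StructField(
        "value", map.valueType(), map.valueContainsNull(), Metadata.empty());

    System.out.println(element.name() + ": " + element.dataType().simpleString());       // element: string
    System.out.println(keyField.name() + ": " + keyField.dataType().simpleString());     // key: string
    System.out.println(valueField.name() + ": " + valueField.dataType().simpleString()); // value: bigint
  }
}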