Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.
The class SparkTypeVisitor, method visit.
static <T> T visit(DataType type, SparkTypeVisitor<T> visitor) {
  if (type instanceof StructType) {
    StructField[] fields = ((StructType) type).fields();
    List<T> fieldResults = Lists.newArrayListWithExpectedSize(fields.length);

    // visit each field's type first, then let the visitor combine the results
    for (StructField field : fields) {
      fieldResults.add(visitor.field(field, visit(field.dataType(), visitor)));
    }

    return visitor.struct((StructType) type, fieldResults);

  } else if (type instanceof MapType) {
    return visitor.map(
        (MapType) type,
        visit(((MapType) type).keyType(), visitor),
        visit(((MapType) type).valueType(), visitor));

  } else if (type instanceof ArrayType) {
    return visitor.array((ArrayType) type, visit(((ArrayType) type).elementType(), visitor));

  } else if (type instanceof UserDefinedType) {
    throw new UnsupportedOperationException("User-defined types are not supported");

  } else {
    // any remaining type is a leaf (atomic) type
    return visitor.atomic(type);
  }
}
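This is a textbook recursive visitor over Spark's DataType tree: struct, map, and array types recurse into their children, user-defined types are rejected, and everything else is handled as an atomic leaf. As a minimal standalone sketch of the same dispatch pattern (not Iceberg's SparkTypeVisitor; the TypeRenderer class and render method are invented for illustration), the following walks a Spark DataType and renders it as a compact string:

import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.MapType;
import org.apache.spark.sql.types.StructType;

public class TypeRenderer {
  // recursive dispatch mirroring SparkTypeVisitor.visit: containers recurse,
  // leaves fall through to Spark's simple type name
  static String render(DataType type) {
    if (type instanceof StructType) {
      String fields = Arrays.stream(((StructType) type).fields())
          .map(f -> f.name() + ": " + render(f.dataType()))
          .collect(Collectors.joining(", "));
      return "struct<" + fields + ">";
    } else if (type instanceof MapType) {
      MapType map = (MapType) type;
      return "map<" + render(map.keyType()) + ", " + render(map.valueType()) + ">";
    } else if (type instanceof ArrayType) {
      return "list<" + render(((ArrayType) type).elementType()) + ">";
    } else {
      return type.simpleString(); // atomic leaf type
    }
  }

  public static void main(String[] args) {
    StructType schema = new StructType()
        .add("id", DataTypes.LongType)
        .add("tags", DataTypes.createArrayType(DataTypes.StringType));
    System.out.println(render(schema)); // struct<id: bigint, tags: list<string>>
  }
}

Running main prints struct<id: bigint, tags: list<string>>, showing the container/leaf split that the visitor above generalizes.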
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.
The class PruneColumnsWithoutReordering, method field.
@Override
public Type field(Types.NestedField field, Supplier<Type> fieldResult) {
  Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);
  StructType requestedStruct = (StructType) current;

  // fields are resolved by name because Spark only sees the current table schema.
  if (requestedStruct.getFieldIndex(field.name()).isEmpty()) {
    // make sure that filter fields are projected even if they aren't in the requested schema.
    if (filterRefs.contains(field.fieldId())) {
      return field.type();
    }
    return null;
  }

  int fieldIndex = requestedStruct.fieldIndex(field.name());
  StructField requestedField = requestedStruct.fields()[fieldIndex];

  Preconditions.checkArgument(requestedField.nullable() || field.isRequired(),
      "Cannot project an optional field as non-null: %s", field.name());

  this.current = requestedField.dataType();
  try {
    return fieldResult.get();
  } catch (IllegalArgumentException e) {
    throw new IllegalArgumentException(
        "Invalid projection for field " + field.name() + ": " + e.getMessage(), e);
  } finally {
    this.current = requestedStruct;
  }
}
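Two different lookups on the requested StructType appear above: getFieldIndex returns a Scala Option and is used to test whether Spark requested the field at all, while fieldIndex returns the position directly once presence is established. A small standalone sketch of that distinction (the FieldLookupDemo class and field names are invented):

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import scala.Option;

public class FieldLookupDemo {
  public static void main(String[] args) {
    StructType struct = new StructType()
        .add("id", DataTypes.LongType)
        .add("data", DataTypes.StringType, true /* nullable */);

    // getFieldIndex returns a Scala Option: empty when the name is absent
    Option<Object> missing = struct.getFieldIndex("ts");
    System.out.println(missing.isEmpty()); // true -> field was not requested

    // fieldIndex returns the position directly (and throws if absent)
    int idx = struct.fieldIndex("data");
    System.out.println(struct.fields()[idx].nullable()); // true
  }
}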
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.
The class PruneColumnsWithoutReordering, method struct.
@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldResults) {
  Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema.");
  Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);

  List<Types.NestedField> fields = struct.fields();
  List<Type> types = Lists.newArrayList(fieldResults);

  boolean changed = false;
  List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(types.size());
  for (int i = 0; i < fields.size(); i += 1) {
    Types.NestedField field = fields.get(i);
    Type type = types.get(i);

    if (type == null) {
      // a null field result means the field was pruned away entirely
      changed = true;
    } else if (field.type() == type) {
      // unchanged field: reuse it as-is
      newFields.add(field);
    } else if (field.isOptional()) {
      changed = true;
      newFields.add(Types.NestedField.optional(field.fieldId(), field.name(), type));
    } else {
      changed = true;
      newFields.add(Types.NestedField.required(field.fieldId(), field.name(), type));
    }
  }

  if (changed) {
    return Types.StructType.of(newFields);
  }

  return struct;
}
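The rebuild keeps each field's id, name, and optional/required flag and only swaps in the pruned type; when nothing changed, the original struct is returned so identity comparisons upstream still hold. A hypothetical sketch of that rebuild step using Iceberg's Types API (all field ids, names, and the pruning scenario are invented):

import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.Types;

public class StructRebuildDemo {
  public static void main(String[] args) {
    Types.StructType original = Types.StructType.of(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "point", Types.StructType.of(
            Types.NestedField.optional(3, "x", Types.DoubleType.get()),
            Types.NestedField.optional(4, "y", Types.DoubleType.get()))));

    // suppose pruning kept only "x" inside "point": the field id, name, and
    // optionality are preserved, only the type changes, so a new struct is built
    Type prunedPoint = Types.StructType.of(
        Types.NestedField.optional(3, "x", Types.DoubleType.get()));
    Types.StructType rebuilt = Types.StructType.of(
        original.fields().get(0),
        Types.NestedField.optional(2, "point", prunedPoint));

    System.out.println(rebuilt);
  }
}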
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.
The class SparkSchemaUtil, method schemaForTable.
/**
* Returns a {@link Schema} for the given table with fresh field ids.
* <p>
* This creates a Schema for an existing table by looking up the table's schema with Spark and
* converting that schema. Spark/Hive partition columns are included in the schema.
*
* @param spark a Spark session
* @param name a table name and (optional) database
* @return a Schema for the table, if found
*/
public static Schema schemaForTable(SparkSession spark, String name) {
  StructType sparkType = spark.table(name).schema();
  Type converted = SparkTypeVisitor.visit(sparkType, new SparkTypeToType(sparkType));
  return new Schema(converted.asNestedType().asStructType().fields());
}
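A minimal usage sketch, assuming a local SparkSession and that a table named db.events is visible to it (the app name and table name are placeholders):

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.spark.sql.SparkSession;

public class SchemaForTableDemo {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("schema-for-table-demo")
        .master("local[*]")
        .getOrCreate();

    // looks up the table's Spark schema and converts it, assigning fresh field ids
    Schema schema = SparkSchemaUtil.schemaForTable(spark, "db.events");
    System.out.println(schema.asStruct());

    spark.stop();
  }
}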
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.
The class WritersBenchmark, method writePartitionedClusteredDataWriter.
@Benchmark
@Threads(1)
public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOException {
  FileIO io = table().io();

  OutputFileFactory fileFactory = newFileFactory();
  SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table())
      .dataFileFormat(fileFormat())
      .dataSchema(table().schema())
      .build();

  ClusteredDataWriter<InternalRow> writer = new ClusteredDataWriter<>(
      writerFactory, fileFactory, io, fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

  PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema());
  StructType dataSparkType = SparkSchemaUtil.convert(table().schema());
  InternalRowWrapper internalRowWrapper = new InternalRowWrapper(dataSparkType);

  try (ClusteredDataWriter<InternalRow> closeableWriter = writer) {
    for (InternalRow row : rows) {
      // compute the partition for each row; a clustered writer expects rows
      // already grouped by partition, so a single mutable key can be reused
      partitionKey.partition(internalRowWrapper.wrap(row));
      closeableWriter.write(row, partitionedSpec, partitionKey);
    }
  }

  blackhole.consume(writer);
}
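Each InternalRow is wrapped so PartitionKey can read Spark rows through Iceberg's StructLike interface, and the resulting key routes the row to its partition; a clustered writer assumes incoming rows are already grouped by partition, whereas the fanout variant keeps a file open per partition instead. To show the partition-key step in isolation, here is a hypothetical sketch using Iceberg's generic records instead of Spark rows (the schema, spec, and values are invented):

import org.apache.iceberg.PartitionKey;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.GenericRecord;
import org.apache.iceberg.types.Types;

public class PartitionKeyDemo {
  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "category", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema)
        .identity("category")
        .build();

    GenericRecord record = GenericRecord.create(schema);
    record.setField("id", 1L);
    record.setField("category", "books");

    PartitionKey key = new PartitionKey(spec, schema);
    key.partition(record); // extract the partition tuple from the record
    System.out.println(key.get(0, String.class)); // books
  }
}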