Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache: class WritersBenchmark, method writeUnpartitionedLegacyDataWriter.
@Benchmark
@Threads(1)
public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOException {
  FileIO io = table().io();
  OutputFileFactory fileFactory = newFileFactory();
  Schema writeSchema = table().schema();
  StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema);
  SparkAppenderFactory appenders =
      SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType)
          .spec(unpartitionedSpec)
          .build();

  TaskWriter<InternalRow> writer =
      new UnpartitionedWriter<>(
          unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES);

  try (TaskWriter<InternalRow> closableWriter = writer) {
    for (InternalRow row : rows) {
      closableWriter.write(row);
    }
  }

  blackhole.consume(writer.complete());
}
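For context, a JMH benchmark such as this is typically launched via the project's Gradle jmh task or a small standalone runner. A minimal runner sketch; the include pattern below is illustrative, not Iceberg's actual configuration:

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class WritersBenchmarkRunner {
  public static void main(String[] args) throws RunnerException {
    // include(...) takes a regex matched against fully qualified benchmark names
    Options opts = new OptionsBuilder()
        .include(".*WritersBenchmark\\.writeUnpartitionedLegacyDataWriter")
        .forks(1)
        .build();
    new Runner(opts).run();
  }
}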
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache: class WritersBenchmark, method writePartitionedLegacyFanoutDataWriter.
@Benchmark
@Threads(1)
public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws IOException {
  FileIO io = table().io();
  OutputFileFactory fileFactory = newFileFactory();
  Schema writeSchema = table().schema();
  StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema);
  SparkAppenderFactory appenders =
      SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType)
          .spec(partitionedSpec)
          .build();

  TaskWriter<InternalRow> writer =
      new SparkPartitionedFanoutWriter(
          partitionedSpec, fileFormat(), appenders, fileFactory, io,
          TARGET_FILE_SIZE_IN_BYTES, writeSchema, sparkWriteType);

  try (TaskWriter<InternalRow> closableWriter = writer) {
    for (InternalRow row : rows) {
      closableWriter.write(row);
    }
  }

  blackhole.consume(writer.complete());
}
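The trailing writeSchema and sparkWriteType constructor arguments are what let this legacy writer derive partition keys on its own. A simplified sketch of how a PartitionedFanoutWriter subclass like SparkPartitionedFanoutWriter can do that (details condensed; treat the field and constructor layout as assumptions):

// simplified sketch: derive a PartitionKey per row inside the writer itself
class SketchPartitionedFanoutWriter extends PartitionedFanoutWriter<InternalRow> {
  private final PartitionKey partitionKey;
  private final InternalRowWrapper wrapper;

  SketchPartitionedFanoutWriter(PartitionSpec spec, FileFormat format,
      FileAppenderFactory<InternalRow> appenderFactory, OutputFileFactory fileFactory,
      FileIO io, long targetFileSize, Schema schema, StructType sparkType) {
    super(spec, format, appenderFactory, fileFactory, io, targetFileSize);
    this.partitionKey = new PartitionKey(spec, schema);
    this.wrapper = new InternalRowWrapper(sparkType);
  }

  @Override
  protected PartitionKey partition(InternalRow row) {
    // wrap the Spark row so Iceberg's partition transforms can read its fields
    partitionKey.partition(wrapper.wrap(row));
    return partitionKey;
  }
}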
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache: class WritersBenchmark, method writePartitionedFanoutDataWriter.
@Benchmark
@Threads(1)
public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOException {
  FileIO io = table().io();
  OutputFileFactory fileFactory = newFileFactory();
  SparkFileWriterFactory writerFactory =
      SparkFileWriterFactory.builderFor(table())
          .dataFileFormat(fileFormat())
          .dataSchema(table().schema())
          .build();

  FanoutDataWriter<InternalRow> writer =
      new FanoutDataWriter<>(writerFactory, fileFactory, io, fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

  PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema());
  StructType dataSparkType = SparkSchemaUtil.convert(table().schema());
  InternalRowWrapper internalRowWrapper = new InternalRowWrapper(dataSparkType);

  try (FanoutDataWriter<InternalRow> closeableWriter = writer) {
    for (InternalRow row : rows) {
      partitionKey.partition(internalRowWrapper.wrap(row));
      closeableWriter.write(row, partitionedSpec, partitionKey);
    }
  }

  blackhole.consume(writer);
}
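A usage note on the newer API: where the legacy TaskWriter exposes complete(), FanoutDataWriter reports its output through result(), which is only meaningful after the writer is closed. A small hedged sketch of what could follow the try-with-resources block above (the benchmark itself just hands the writer to the blackhole):

// the try-with-resources block has already closed the writer, so its
// result is available; DataWriteResult exposes the produced data files
DataWriteResult result = writer.result();
for (DataFile file : result.dataFiles()) {
  blackhole.consume(file.recordCount());
}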
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache: class PruneColumnsWithReordering, method struct.
@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldResults) {
  Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema.");
  Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);
  StructType requestedStruct = (StructType) current;

  List<Types.NestedField> fields = struct.fields();
  List<Type> types = Lists.newArrayList(fieldResults);

  boolean changed = false;
  // use a LinkedHashMap to preserve the original order of filter fields that are not projected
  Map<String, Types.NestedField> projectedFields = Maps.newLinkedHashMap();
  for (int i = 0; i < fields.size(); i += 1) {
    Types.NestedField field = fields.get(i);
    Type type = types.get(i);

    if (type == null) {
      changed = true;
    } else if (field.type() == type) {
      projectedFields.put(field.name(), field);
    } else if (field.isOptional()) {
      changed = true;
      projectedFields.put(field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type));
    } else {
      changed = true;
      projectedFields.put(field.name(), Types.NestedField.required(field.fieldId(), field.name(), type));
    }
  }

  // Construct a new struct with the projected struct's order
  boolean reordered = false;
  StructField[] requestedFields = requestedStruct.fields();
  List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(requestedFields.length);
  for (int i = 0; i < requestedFields.length; i += 1) {
    // fields are resolved by name because Spark only sees the current table schema.
    String name = requestedFields[i].name();
    if (!fields.get(i).name().equals(name)) {
      reordered = true;
    }
    newFields.add(projectedFields.remove(name));
  }

  // Add remaining filter fields that were not explicitly projected
  if (!projectedFields.isEmpty()) {
    newFields.addAll(projectedFields.values());
    // order probably changed
    changed = true;
  }

  if (reordered || changed) {
    return Types.StructType.of(newFields);
  }

  return struct;
}
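In practice this visitor runs over a whole schema via TypeUtil.visit. A minimal sketch of the reordering behavior, assuming the class's two-argument constructor (requested Spark type plus the set of filter field ids) and made-up column names:

import org.apache.iceberg.Schema;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// an Iceberg table schema with three columns
Schema schema = new Schema(
    Types.NestedField.required(1, "id", Types.LongType.get()),
    Types.NestedField.optional(2, "data", Types.StringType.get()),
    Types.NestedField.optional(3, "category", Types.StringType.get()));

// Spark requests (data, id), in that order, and omits category
StructType requested = new StructType(new StructField[] {
    new StructField("data", DataTypes.StringType, true, Metadata.empty()),
    new StructField("id", DataTypes.LongType, false, Metadata.empty())
});

// apply the visitor directly; in Iceberg the class is package-private, so
// real callers reach it through SparkSchemaUtil's prune helpers
Type pruned = TypeUtil.visit(schema, new PruneColumnsWithReordering(requested, ImmutableSet.of()));
// pruned.asStructType() contains only data and id, reordered to match the request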
Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache: class PruneColumnsWithReordering, method field.
@Override
public Type field(Types.NestedField field, Supplier<Type> fieldResult) {
  Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);
  StructType requestedStruct = (StructType) current;

  // fields are resolved by name because Spark only sees the current table schema.
  if (requestedStruct.getFieldIndex(field.name()).isEmpty()) {
    // make sure that filter fields are projected even if they aren't in the requested schema.
    if (filterRefs.contains(field.fieldId())) {
      return field.type();
    }
    return null;
  }

  int fieldIndex = requestedStruct.fieldIndex(field.name());
  StructField requestedField = requestedStruct.fields()[fieldIndex];

  Preconditions.checkArgument(requestedField.nullable() || field.isRequired(),
      "Cannot project an optional field as non-null: %s", field.name());

  this.current = requestedField.dataType();
  try {
    return fieldResult.get();
  } catch (IllegalArgumentException e) {
    throw new IllegalArgumentException(
        "Invalid projection for field " + field.name() + ": " + e.getMessage(), e);
  } finally {
    this.current = requestedStruct;
  }
}
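The filterRefs check is what keeps pushed-down filter columns alive: a field missing from Spark's requested schema is still projected when a filter references it, and struct() above appends such leftovers after the requested fields. A hedged fragment continuing the sketch above (the field id is made up; in Iceberg the ids would come from binding the pushed-down filter expressions against the schema):

// suppose a pushed-down filter references "category" (field id 3)
Set<Integer> filterRefs = ImmutableSet.of(3);

Type prunedWithFilter =
    TypeUtil.visit(schema, new PruneColumnsWithReordering(requested, filterRefs));
// "category" survives pruning and is appended after the requested fields
// (data, id), so the filter can still be evaluated against the read schema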