
Example 21 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class WritersBenchmark, method writeUnpartitionedLegacyDataWriter:

@Benchmark
@Threads(1)
public void writeUnpartitionedLegacyDataWriter(Blackhole blackhole) throws IOException {
    FileIO io = table().io();
    OutputFileFactory fileFactory = newFileFactory();
    Schema writeSchema = table().schema();
    StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema);
    SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType)
            .spec(unpartitionedSpec)
            .build();
    TaskWriter<InternalRow> writer = new UnpartitionedWriter<>(
            unpartitionedSpec, fileFormat(), appenders, fileFactory, io, TARGET_FILE_SIZE_IN_BYTES);
    try (TaskWriter<InternalRow> closableWriter = writer) {
        for (InternalRow row : rows) {
            closableWriter.write(row);
        }
    }
    blackhole.consume(writer.complete());
}
Also used: OutputFileFactory (org.apache.iceberg.io.OutputFileFactory), StructType (org.apache.spark.sql.types.StructType), Schema (org.apache.iceberg.Schema), UnpartitionedWriter (org.apache.iceberg.io.UnpartitionedWriter), InternalRow (org.apache.spark.sql.catalyst.InternalRow), FileIO (org.apache.iceberg.io.FileIO), Threads (org.openjdk.jmh.annotations.Threads), Benchmark (org.openjdk.jmh.annotations.Benchmark)
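
Both legacy writer benchmarks rely on SparkSchemaUtil.convert to derive the Spark write type from the Iceberg write schema. A minimal standalone sketch of that conversion; the two-column schema here is hypothetical:

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.StructType;

public class ConvertSketch {
    public static void main(String[] args) {
        // hypothetical Iceberg schema; Iceberg tracks explicit field IDs
        Schema writeSchema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()));
        // convert maps Iceberg types to Spark SQL types; required fields
        // become non-nullable StructFields in the resulting StructType
        StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema);
        System.out.println(sparkWriteType.treeString());
    }
}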

Example 22 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class WritersBenchmark, method writePartitionedLegacyFanoutDataWriter:

@Benchmark
@Threads(1)
public void writePartitionedLegacyFanoutDataWriter(Blackhole blackhole) throws IOException {
    FileIO io = table().io();
    OutputFileFactory fileFactory = newFileFactory();
    Schema writeSchema = table().schema();
    StructType sparkWriteType = SparkSchemaUtil.convert(writeSchema);
    SparkAppenderFactory appenders = SparkAppenderFactory.builderFor(table(), writeSchema, sparkWriteType)
            .spec(partitionedSpec)
            .build();
    TaskWriter<InternalRow> writer = new SparkPartitionedFanoutWriter(
            partitionedSpec, fileFormat(), appenders, fileFactory, io,
            TARGET_FILE_SIZE_IN_BYTES, writeSchema, sparkWriteType);
    try (TaskWriter<InternalRow> closableWriter = writer) {
        for (InternalRow row : rows) {
            closableWriter.write(row);
        }
    }
    blackhole.consume(writer.complete());
}
Also used : OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) StructType(org.apache.spark.sql.types.StructType) Schema(org.apache.iceberg.Schema) InternalRow(org.apache.spark.sql.catalyst.InternalRow) FileIO(org.apache.iceberg.io.FileIO) Threads(org.openjdk.jmh.annotations.Threads) Benchmark(org.openjdk.jmh.annotations.Benchmark)
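
SparkPartitionedFanoutWriter is what makes this a fanout benchmark: a fanout writer keeps one open file per partition key, so input rows do not need to be pre-sorted by partition. A hedged sketch of the per-row key derivation such a writer performs, assuming Iceberg's PartitionedFanoutWriter contract; the subclass name is illustrative, and InternalRowWrapper is internal to the Iceberg Spark module:

// illustrative subclass mirroring what SparkPartitionedFanoutWriter does;
// imports from org.apache.iceberg, org.apache.iceberg.io and Spark elided
class SketchPartitionedFanoutWriter extends PartitionedFanoutWriter<InternalRow> {
    private final PartitionKey partitionKey;
    private final InternalRowWrapper internalRowWrapper;

    SketchPartitionedFanoutWriter(PartitionSpec spec, FileFormat format,
                                  FileAppenderFactory<InternalRow> appenderFactory,
                                  OutputFileFactory fileFactory, FileIO io,
                                  long targetFileSize, Schema schema, StructType sparkSchema) {
        super(spec, format, appenderFactory, fileFactory, io, targetFileSize);
        this.partitionKey = new PartitionKey(spec, schema);
        this.internalRowWrapper = new InternalRowWrapper(sparkSchema);
    }

    @Override
    protected PartitionKey partition(InternalRow row) {
        // evaluate the partition transforms against the wrapped row; the base
        // class then routes the row to the file that is open for this key
        partitionKey.partition(internalRowWrapper.wrap(row));
        return partitionKey;
    }
}

By contrast, the non-fanout PartitionedWriter assumes rows arrive grouped by partition and closes each file when the key changes, which is why fanout is the variant benchmarked against unsorted input.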

Example 23 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class WritersBenchmark, method writePartitionedFanoutDataWriter:

@Benchmark
@Threads(1)
public void writePartitionedFanoutDataWriter(Blackhole blackhole) throws IOException {
    FileIO io = table().io();
    OutputFileFactory fileFactory = newFileFactory();
    SparkFileWriterFactory writerFactory = SparkFileWriterFactory.builderFor(table())
            .dataFileFormat(fileFormat())
            .dataSchema(table().schema())
            .build();
    FanoutDataWriter<InternalRow> writer = new FanoutDataWriter<>(
            writerFactory, fileFactory, io, fileFormat(), TARGET_FILE_SIZE_IN_BYTES);
    PartitionKey partitionKey = new PartitionKey(partitionedSpec, table().schema());
    StructType dataSparkType = SparkSchemaUtil.convert(table().schema());
    InternalRowWrapper internalRowWrapper = new InternalRowWrapper(dataSparkType);
    try (FanoutDataWriter<InternalRow> closeableWriter = writer) {
        for (InternalRow row : rows) {
            partitionKey.partition(internalRowWrapper.wrap(row));
            closeableWriter.write(row, partitionedSpec, partitionKey);
        }
    }
    blackhole.consume(writer);
}
Also used: OutputFileFactory (org.apache.iceberg.io.OutputFileFactory), StructType (org.apache.spark.sql.types.StructType), PartitionKey (org.apache.iceberg.PartitionKey), InternalRow (org.apache.spark.sql.catalyst.InternalRow), FanoutDataWriter (org.apache.iceberg.io.FanoutDataWriter), FileIO (org.apache.iceberg.io.FileIO), Threads (org.openjdk.jmh.annotations.Threads), Benchmark (org.openjdk.jmh.annotations.Benchmark)
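
Note the different completion protocol: the legacy benchmarks consume writer.complete(), while FanoutDataWriter is closed by the try-with-resources block and then consumed; its output is exposed through result(). A hedged sketch of reading that result, assuming the same Iceberg version as the snippet:

// continues after the try-with-resources block has closed the writer;
// DataWriteResult is org.apache.iceberg.io.DataWriteResult
DataWriteResult result = writer.result();
for (DataFile file : result.dataFiles()) {
    // each DataFile carries the path, metrics, and partition tuple a commit needs
    System.out.println(file.path() + ": " + file.recordCount() + " records");
}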

Example 24 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class PruneColumnsWithReordering, method struct:

@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldResults) {
    Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema.");
    Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);
    StructType requestedStruct = (StructType) current;
    List<Types.NestedField> fields = struct.fields();
    List<Type> types = Lists.newArrayList(fieldResults);
    boolean changed = false;
    // use a LinkedHashMap to preserve the original order of filter fields that are not projected
    Map<String, Types.NestedField> projectedFields = Maps.newLinkedHashMap();
    for (int i = 0; i < fields.size(); i += 1) {
        Types.NestedField field = fields.get(i);
        Type type = types.get(i);
        if (type == null) {
            changed = true;
        } else if (field.type() == type) {
            projectedFields.put(field.name(), field);
        } else if (field.isOptional()) {
            changed = true;
            projectedFields.put(field.name(), Types.NestedField.optional(field.fieldId(), field.name(), type));
        } else {
            changed = true;
            projectedFields.put(field.name(), Types.NestedField.required(field.fieldId(), field.name(), type));
        }
    }
    // Construct a new struct with the projected struct's order
    boolean reordered = false;
    StructField[] requestedFields = requestedStruct.fields();
    List<Types.NestedField> newFields = Lists.newArrayListWithExpectedSize(requestedFields.length);
    for (int i = 0; i < requestedFields.length; i += 1) {
        // fields are resolved by name because Spark only sees the current table schema.
        String name = requestedFields[i].name();
        if (!fields.get(i).name().equals(name)) {
            reordered = true;
        }
        newFields.add(projectedFields.remove(name));
    }
    // Add remaining filter fields that were not explicitly projected
    if (!projectedFields.isEmpty()) {
        newFields.addAll(projectedFields.values());
        // order probably changed
        changed = true;
    }
    if (reordered || changed) {
        return Types.StructType.of(newFields);
    }
    return struct;
}
Also used: Types (org.apache.iceberg.types.Types), StructType (org.apache.spark.sql.types.StructType), BinaryType (org.apache.spark.sql.types.BinaryType), DataType (org.apache.spark.sql.types.DataType), FloatType (org.apache.spark.sql.types.FloatType), DecimalType (org.apache.spark.sql.types.DecimalType), ArrayType (org.apache.spark.sql.types.ArrayType), DoubleType (org.apache.spark.sql.types.DoubleType), IntegerType (org.apache.spark.sql.types.IntegerType), StringType (org.apache.spark.sql.types.StringType), LongType (org.apache.spark.sql.types.LongType), TimestampType (org.apache.spark.sql.types.TimestampType), Type (org.apache.iceberg.types.Type), BooleanType (org.apache.spark.sql.types.BooleanType), DateType (org.apache.spark.sql.types.DateType), MapType (org.apache.spark.sql.types.MapType), StructField (org.apache.spark.sql.types.StructField)
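
This visitor is not usually invoked directly; it is driven through a TypeUtil traversal, typically via the prune helpers on SparkSchemaUtil. A hedged sketch of such a call, assuming the two-argument prune overload (which of the reordering and non-reordering visitor variants it dispatches to is a version-dependent internal detail); the schema and projection are hypothetical:

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class PruneSketch {
    public static void main(String[] args) {
        // hypothetical table schema with three columns
        Schema tableSchema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()),
                Types.NestedField.optional(3, "ts", Types.TimestampType.withZone()));

        // Spark requests two of the columns, in a different order than the table schema
        StructType requestedType = new StructType()
                .add("data", DataTypes.StringType)
                .add("id", DataTypes.LongType, false);

        // prune drops "ts" and resolves the remaining fields by name
        Schema projected = SparkSchemaUtil.prune(tableSchema, requestedType);
        System.out.println(projected);
    }
}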

Example 25 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class PruneColumnsWithReordering, method field:

@Override
public Type field(Types.NestedField field, Supplier<Type> fieldResult) {
    Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);
    StructType requestedStruct = (StructType) current;
    // fields are resolved by name because Spark only sees the current table schema.
    if (requestedStruct.getFieldIndex(field.name()).isEmpty()) {
        // make sure that filter fields are projected even if they aren't in the requested schema.
        if (filterRefs.contains(field.fieldId())) {
            return field.type();
        }
        return null;
    }
    int fieldIndex = requestedStruct.fieldIndex(field.name());
    StructField requestedField = requestedStruct.fields()[fieldIndex];
    Preconditions.checkArgument(requestedField.nullable() || field.isRequired(), "Cannot project an optional field as non-null: %s", field.name());
    this.current = requestedField.dataType();
    try {
        return fieldResult.get();
    } catch (IllegalArgumentException e) {
        throw new IllegalArgumentException("Invalid projection for field " + field.name() + ": " + e.getMessage(), e);
    } finally {
        this.current = requestedStruct;
    }
}
Also used: StructField (org.apache.spark.sql.types.StructField), StructType (org.apache.spark.sql.types.StructType)
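
The filterRefs branch above is the reason pushed-down filter columns survive pruning even when Spark's requested schema omits them: the residual filter still has to be evaluated against the projected rows. A hedged fragment showing the filter-aware prune overload, assuming it accepts an Iceberg filter expression plus a case-sensitivity flag; tableSchema is the hypothetical schema from the previous sketch:

// fragment; add org.apache.iceberg.expressions.{Expression, Expressions} imports
// Spark asks only for "data", but the pushed-down predicate references "id"
StructType requestedType = new StructType().add("data", DataTypes.StringType);
Expression filter = Expressions.greaterThan("id", 100L);

// "id" is retained in the projection so the filter can still be evaluated
Schema projected = SparkSchemaUtil.prune(tableSchema, requestedType, filter, true);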

Aggregations

StructType (org.apache.spark.sql.types.StructType): 418
StructField (org.apache.spark.sql.types.StructField): 228
Row (org.apache.spark.sql.Row): 200
ArrayList (java.util.ArrayList): 152
Test (org.junit.Test): 131
Script (org.apache.sysml.api.mlcontext.Script): 68
SparkSession (org.apache.spark.sql.SparkSession): 61
List (java.util.List): 41
DataType (org.apache.spark.sql.types.DataType): 40
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 36
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 34
DenseVector (org.apache.spark.ml.linalg.DenseVector): 33
Map (java.util.Map): 31
ArrayType (org.apache.spark.sql.types.ArrayType): 30
Dataset (org.apache.spark.sql.Dataset): 28
Tuple2 (scala.Tuple2): 28
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 27
Vector (org.apache.spark.ml.linalg.Vector): 27
IOException (java.io.IOException): 26
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 25