Search in sources :

Example 16 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class SparkTypeVisitor method visit.

/**
 * Walks a Spark {@link DataType} and hands each node to the matching visitor callback.
 * <p>
 * Structs, maps and arrays are traversed recursively: children are visited first and their
 * results are passed to the callback for the enclosing type. Any type that is not a struct,
 * map, array or user-defined type is passed to {@code visitor.atomic}.
 *
 * @param type the Spark type to traverse
 * @param visitor the visitor whose callbacks produce the results
 * @param <T> the result type produced by the visitor
 * @return the visitor's result for {@code type}
 * @throws UnsupportedOperationException if {@code type} is a {@link UserDefinedType}
 */
static <T> T visit(DataType type, SparkTypeVisitor<T> visitor) {
    if (type instanceof StructType) {
        StructType structType = (StructType) type;
        StructField[] members = structType.fields();
        List<T> memberResults = Lists.newArrayListWithExpectedSize(members.length);
        for (StructField member : members) {
            // visit the member's type first, then let the visitor combine it with the field
            T memberTypeResult = visit(member.dataType(), visitor);
            memberResults.add(visitor.field(member, memberTypeResult));
        }
        return visitor.struct(structType, memberResults);
    }

    if (type instanceof MapType) {
        MapType mapType = (MapType) type;
        // key is visited before value, matching argument evaluation order
        T keyResult = visit(mapType.keyType(), visitor);
        T valueResult = visit(mapType.valueType(), visitor);
        return visitor.map(mapType, keyResult, valueResult);
    }

    if (type instanceof ArrayType) {
        ArrayType arrayType = (ArrayType) type;
        T elementResult = visit(arrayType.elementType(), visitor);
        return visitor.array(arrayType, elementResult);
    }

    if (type instanceof UserDefinedType) {
        throw new UnsupportedOperationException("User-defined types are not supported");
    }

    return visitor.atomic(type);
}
Also used : ArrayType(org.apache.spark.sql.types.ArrayType) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) UserDefinedType(org.apache.spark.sql.types.UserDefinedType) MapType(org.apache.spark.sql.types.MapType)

Example 17 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class PruneColumnsWithoutReordering method field.

/**
 * Resolves one Iceberg field against the Spark struct held in {@code current}.
 * <p>
 * The field is looked up by name in the requested struct. When it is absent, the field's own
 * type is returned if its id is in {@code filterRefs}, otherwise {@code null}. When it is
 * present, {@code current} is switched to the requested field's Spark type while
 * {@code fieldResult} is evaluated and is restored to the struct afterwards.
 *
 * @param field the Iceberg field being visited
 * @param fieldResult supplier that produces the result for the field's type
 * @return the result for the field, or {@code null} when the field is neither requested nor
 *         referenced by a filter
 * @throws IllegalArgumentException if {@code current} is not a struct, if a non-required
 *         field is requested as non-nullable, or if evaluating {@code fieldResult} fails
 *         with an {@code IllegalArgumentException}
 */
@Override
public Type field(Types.NestedField field, Supplier<Type> fieldResult) {
    Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);
    StructType enclosingStruct = (StructType) current;
    String fieldName = field.name();

    // fields are resolved by name because Spark only sees the current table schema.
    boolean requested = !enclosingStruct.getFieldIndex(fieldName).isEmpty();
    if (!requested) {
        // make sure that filter fields are projected even if they aren't in the requested schema.
        return filterRefs.contains(field.fieldId()) ? field.type() : null;
    }

    StructField sparkField = enclosingStruct.fields()[enclosingStruct.fieldIndex(fieldName)];
    Preconditions.checkArgument(sparkField.nullable() || field.isRequired(), "Cannot project an optional field as non-null: %s", fieldName);

    // descend into the field's type; the finally block restores the struct for sibling fields
    this.current = sparkField.dataType();
    try {
        return fieldResult.get();
    } catch (IllegalArgumentException cause) {
        String message = "Invalid projection for field " + fieldName + ": " + cause.getMessage();
        throw new IllegalArgumentException(message, cause);
    } finally {
        this.current = enclosingStruct;
    }
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType)

Example 18 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class PruneColumnsWithoutReordering method struct.

/**
 * Rebuilds an Iceberg struct from the per-field results of a projection.
 * <p>
 * A {@code null} result drops the field. A result that is the same instance as the field's
 * existing type keeps the field as-is. Any other result replaces the field's type while
 * preserving its id, name and optional/required flag. The original {@code struct} instance
 * is returned when no field was dropped or replaced.
 *
 * @param struct the Iceberg struct being visited; must not be null
 * @param fieldResults the result for each field, in the same order as {@code struct.fields()}
 * @return {@code struct} if unchanged, otherwise a new struct holding the surviving fields
 */
@Override
public Type struct(Types.StructType struct, Iterable<Type> fieldResults) {
    Preconditions.checkNotNull(struct, "Cannot prune null struct. Pruning must start with a schema.");
    Preconditions.checkArgument(current instanceof StructType, "Not a struct: %s", current);

    List<Types.NestedField> originalFields = struct.fields();
    List<Type> projectedTypes = Lists.newArrayList(fieldResults);
    List<Types.NestedField> keptFields = Lists.newArrayListWithExpectedSize(projectedTypes.size());
    boolean modified = false;

    for (int pos = 0; pos < originalFields.size(); pos += 1) {
        Types.NestedField original = originalFields.get(pos);
        Type projected = projectedTypes.get(pos);

        if (projected == null) {
            // field was pruned
            modified = true;
            continue;
        }

        if (original.type() == projected) {
            // same type instance: the field can be reused untouched
            keptFields.add(original);
            continue;
        }

        modified = true;
        keptFields.add(original.isOptional()
            ? Types.NestedField.optional(original.fieldId(), original.name(), projected)
            : Types.NestedField.required(original.fieldId(), original.name(), projected));
    }

    return modified ? Types.StructType.of(keptFields) : struct;
}
Also used : Types(org.apache.iceberg.types.Types) BinaryType(org.apache.spark.sql.types.BinaryType) DataType(org.apache.spark.sql.types.DataType) FloatType(org.apache.spark.sql.types.FloatType) DecimalType(org.apache.spark.sql.types.DecimalType) ArrayType(org.apache.spark.sql.types.ArrayType) DoubleType(org.apache.spark.sql.types.DoubleType) StructType(org.apache.spark.sql.types.StructType) IntegerType(org.apache.spark.sql.types.IntegerType) StringType(org.apache.spark.sql.types.StringType) LongType(org.apache.spark.sql.types.LongType) TimestampType(org.apache.spark.sql.types.TimestampType) Type(org.apache.iceberg.types.Type) BooleanType(org.apache.spark.sql.types.BooleanType) DateType(org.apache.spark.sql.types.DateType) MapType(org.apache.spark.sql.types.MapType) StructType(org.apache.spark.sql.types.StructType)

Example 19 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class SparkSchemaUtil method schemaForTable.

/**
 * Builds a {@link Schema} with fresh field ids for a table known to Spark.
 * <p>
 * The table's schema is looked up through the given Spark session and then converted, so the
 * table must already exist. Spark/Hive partition columns are part of the resulting schema.
 *
 * @param spark a Spark session
 * @param name a table name and (optional) database
 * @return a Schema for the table, if found
 */
public static Schema schemaForTable(SparkSession spark, String name) {
    StructType tableStruct = spark.table(name).schema();
    SparkTypeToType converter = new SparkTypeToType(tableStruct);
    Type icebergType = SparkTypeVisitor.visit(tableStruct, converter);
    // the root of the converted type is a struct; its fields become the schema's columns
    return new Schema(icebergType.asNestedType().asStructType().fields());
}
Also used : DataType(org.apache.spark.sql.types.DataType) StructType(org.apache.spark.sql.types.StructType) Type(org.apache.iceberg.types.Type) StructType(org.apache.spark.sql.types.StructType) Schema(org.apache.iceberg.Schema)

Example 20 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class WritersBenchmark method writePartitionedClusteredDataWriter.

/**
 * Benchmarks writing every row in {@code rows} through a {@link ClusteredDataWriter} using
 * {@code partitionedSpec}.
 * <p>
 * For each row the partition key is recomputed from the wrapped row before the row is
 * written. The writer is closed by the try-with-resources block and then handed to the
 * blackhole.
 *
 * @param blackhole JMH sink that consumes the writer after it has been closed
 * @throws IOException declared for the write and close calls on the writer
 */
@Benchmark
@Threads(1)
public void writePartitionedClusteredDataWriter(Blackhole blackhole) throws IOException {
    FileIO fileIo = table().io();
    OutputFileFactory outputFiles = newFileFactory();
    SparkFileWriterFactory rowWriters = SparkFileWriterFactory.builderFor(table())
        .dataFileFormat(fileFormat())
        .dataSchema(table().schema())
        .build();
    ClusteredDataWriter<InternalRow> clusteredWriter =
        new ClusteredDataWriter<>(rowWriters, outputFiles, fileIo, fileFormat(), TARGET_FILE_SIZE_IN_BYTES);

    PartitionKey key = new PartitionKey(partitionedSpec, table().schema());
    // wraps each Spark row so the partition key can be computed from it
    InternalRowWrapper rowWrapper = new InternalRowWrapper(SparkSchemaUtil.convert(table().schema()));

    try (ClusteredDataWriter<InternalRow> closing = clusteredWriter) {
        for (InternalRow current : rows) {
            key.partition(rowWrapper.wrap(current));
            closing.write(current, partitionedSpec, key);
        }
    }

    blackhole.consume(clusteredWriter);
}
Also used : OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) StructType(org.apache.spark.sql.types.StructType) ClusteredDataWriter(org.apache.iceberg.io.ClusteredDataWriter) PartitionKey(org.apache.iceberg.PartitionKey) InternalRow(org.apache.spark.sql.catalyst.InternalRow) FileIO(org.apache.iceberg.io.FileIO) Threads(org.openjdk.jmh.annotations.Threads) Benchmark(org.openjdk.jmh.annotations.Benchmark)

Aggregations

StructType (org.apache.spark.sql.types.StructType)418 StructField (org.apache.spark.sql.types.StructField)228 Row (org.apache.spark.sql.Row)200 ArrayList (java.util.ArrayList)152 Test (org.junit.Test)131 Script (org.apache.sysml.api.mlcontext.Script)68 SparkSession (org.apache.spark.sql.SparkSession)61 List (java.util.List)41 DataType (org.apache.spark.sql.types.DataType)40 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)36 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)34 DenseVector (org.apache.spark.ml.linalg.DenseVector)33 Map (java.util.Map)31 ArrayType (org.apache.spark.sql.types.ArrayType)30 Dataset (org.apache.spark.sql.Dataset)28 Tuple2 (scala.Tuple2)28 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)27 Vector (org.apache.spark.ml.linalg.Vector)27 IOException (java.io.IOException)26 InternalRow (org.apache.spark.sql.catalyst.InternalRow)25