Search in sources :

Example 26 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class IcebergSourceBenchmark method appendAsFile.

/**
 * Appends the rows of {@code ds} to the table at {@code table.location()} using the
 * "iceberg" format, after coalescing the data into a single partition.
 *
 * @param ds rows to append; they are re-wrapped with the table's own schema before writing
 */
protected void appendAsFile(Dataset<Row> ds) {
    // rebuild the frame against the table schema so nullability matches it exactly
    StructType tableSchema = SparkSchemaUtil.convert(table.schema());
    Dataset<Row> singlePartition = spark.createDataFrame(ds.rdd(), tableSchema).coalesce(1);
    singlePartition
        .write()
        .format("iceberg")
        .mode(SaveMode.Append)
        .save(table.location());
}
Also used : StructType(org.apache.spark.sql.types.StructType)

Example 27 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class BaseRewriteManifestsSparkAction method writeManifestsForUnpartitionedTable.

/**
 * Rewrites the manifest entries of an unpartitioned table into {@code numManifests} partitions
 * and collects the manifests produced for each partition.
 *
 * @param manifestEntryDF manifest entries; must expose a struct column named "data_file"
 * @param numManifests number of partitions the entries are repartitioned into
 * @return the manifests produced by {@code toManifests}, collected as a list
 */
private List<ManifestFile> writeManifestsForUnpartitionedTable(Dataset<Row> manifestEntryDF, int numManifests) {
    Broadcast<FileIO> broadcastIO = sparkContext().broadcast(fileIO);
    StructType dataFileType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

    // unpartitioned tables are bounded only by the target manifest count, so the
    // per-manifest entry limit is effectively disabled
    long unboundedEntries = Long.MAX_VALUE;

    Dataset<Row> repartitioned = manifestEntryDF.repartition(numManifests);
    return repartitioned
        .mapPartitions(
            toManifests(broadcastIO, unboundedEntries, stagingLocation, formatVersion, spec, dataFileType),
            manifestEncoder)
        .collectAsList();
}
Also used : StructType(org.apache.spark.sql.types.StructType) FileIO(org.apache.iceberg.io.FileIO)

Example 28 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class BaseRewriteManifestsSparkAction method writeManifestsForPartitionedTable.

/**
 * Rewrites the manifest entries of a partitioned table: entries are range-partitioned and
 * sorted by the "data_file.partition" column, then turned into manifests per partition.
 *
 * @param manifestEntryDF manifest entries; must expose a struct column named "data_file"
 * @param numManifests number of range partitions to create
 * @param targetNumManifestEntries desired number of entries per manifest
 * @return the manifests produced by {@code toManifests}, collected as a list
 */
private List<ManifestFile> writeManifestsForPartitionedTable(Dataset<Row> manifestEntryDF, int numManifests, int targetNumManifestEntries) {
    Broadcast<FileIO> broadcastIO = sparkContext().broadcast(fileIO);
    StructType dataFileType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

    // tolerate manifests up to 10% larger than the target in case the estimate is imprecise
    long entryLimit = (long) (1.1 * targetNumManifestEntries);

    return withReusableDS(manifestEntryDF, reusable -> {
        Column partitionCol = reusable.col("data_file.partition");
        Dataset<Row> clustered = reusable
            .repartitionByRange(numManifests, partitionCol)
            .sortWithinPartitions(partitionCol);
        return clustered
            .mapPartitions(
                toManifests(broadcastIO, entryLimit, stagingLocation, formatVersion, spec, dataFileType),
                manifestEncoder)
            .collectAsList();
    });
}
Also used : StructType(org.apache.spark.sql.types.StructType) Column(org.apache.spark.sql.Column) FileIO(org.apache.iceberg.io.FileIO)

Example 29 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class BaseTableCreationSparkAction method stageDestTable.

/**
 * Stages creation of the destination table in the destination catalog, using the source
 * table's schema and partitioning together with {@code destTableProps()}.
 *
 * @return the staged table returned by the destination catalog's {@code stageCreate}
 * @throws NoSuchNamespaceException if the catalog reports that the namespace does not exist
 * @throws AlreadyExistsException if the catalog reports that the table already exists
 */
protected StagedSparkTable stageDestTable() {
    try {
        Map<String, String> props = destTableProps();
        StructType schema = sourceTable.schema();
        Transform[] partitioning = sourceTable.partitioning();
        return (StagedSparkTable) destCatalog().stageCreate(destTableIdent(), schema, partitioning, props);
    } catch (org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException e) {
        // translate to the Iceberg exception type, keeping the Spark exception as the cause
        throw new NoSuchNamespaceException(e, "Cannot create table %s as the namespace does not exist", destTableIdent());
    } catch (org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException e) {
        // translate to the Iceberg exception type, keeping the Spark exception as the cause
        throw new AlreadyExistsException(e, "Cannot create table %s as it already exists", destTableIdent());
    }
}
Also used : StructType(org.apache.spark.sql.types.StructType) AlreadyExistsException(org.apache.iceberg.exceptions.AlreadyExistsException) NoSuchNamespaceException(org.apache.iceberg.exceptions.NoSuchNamespaceException) StagedSparkTable(org.apache.iceberg.spark.source.StagedSparkTable) Transform(org.apache.spark.sql.connector.expressions.Transform)

Example 30 with StructType$

use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

the class ParquetWithSparkSchemaVisitor method visit.

/**
 * Visits a Parquet {@code type} together with the Spark {@code sType} paired with it and
 * dispatches to the matching callback on {@code visitor}.
 *
 * <p>Dispatch, in order:
 * <ul>
 *   <li>a {@code MessageType} goes to {@code visitor.message};</li>
 *   <li>a primitive type goes to {@code visitor.primitive};</li>
 *   <li>a group annotated {@code LIST} goes to {@code visitor.list};</li>
 *   <li>a group annotated {@code MAP} goes to {@code visitor.map};</li>
 *   <li>any other group (no annotation, or a different one) goes to {@code visitor.struct}.</li>
 * </ul>
 *
 * <p>While the children of a list or map are visited, the name of the repeated inner group is
 * pushed onto {@code visitor.fieldNames}; it is popped again in a {@code finally} block. A child
 * that is absent from the Parquet group (an empty repeated group, or a map group holding only a
 * key or only a value) is handed to the callback as {@code null}.
 *
 * <p>Every shape requirement is enforced with {@code Preconditions.checkArgument}.
 *
 * @param sType Spark type paired with {@code type}; must not be null and must be a
 *     {@code StructType} for messages and structs, an {@code ArrayType} for lists and a
 *     {@code MapType} for maps
 * @param type Parquet type to visit
 * @param visitor visitor receiving the callbacks
 * @param <T> result type produced by the visitor
 * @return the value returned by the selected visitor callback
 */
public static <T> T visit(DataType sType, Type type, ParquetWithSparkSchemaVisitor<T> visitor) {
    Preconditions.checkArgument(sType != null, "Invalid DataType: null");
    if (type instanceof MessageType) {
        Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType);
        StructType struct = (StructType) sType;
        return visitor.message(struct, (MessageType) type, visitFields(struct, type.asGroupType(), visitor));
    } else if (type.isPrimitive()) {
        return visitor.primitive(sType, type.asPrimitiveType());
    } else {
        // if not a primitive, the typeId must be a group
        GroupType group = type.asGroupType();
        OriginalType annotation = group.getOriginalType();
        if (annotation != null) {
            switch(annotation) {
                case LIST:
                    // expected shape: a non-repeated group holding exactly one repeated group,
                    // which in turn holds at most one field (the element)
                    Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), "Invalid list: top-level group is repeated: %s", group);
                    Preconditions.checkArgument(group.getFieldCount() == 1, "Invalid list: does not contain single repeated field: %s", group);
                    GroupType repeatedElement = group.getFields().get(0).asGroupType();
                    Preconditions.checkArgument(repeatedElement.isRepetition(Repetition.REPEATED), "Invalid list: inner group is not repeated");
                    Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, "Invalid list: repeated group is not a single field: %s", group);
                    Preconditions.checkArgument(sType instanceof ArrayType, "Invalid list: %s is not an array", sType);
                    ArrayType array = (ArrayType) sType;
                    // synthetic field for the element, taking type and nullability from the array
                    StructField element = new StructField("element", array.elementType(), array.containsNull(), Metadata.empty());
                    visitor.fieldNames.push(repeatedElement.getName());
                    try {
                        // stays null when the repeated group has no field to visit
                        T elementResult = null;
                        if (repeatedElement.getFieldCount() > 0) {
                            elementResult = visitField(element, repeatedElement.getType(0), visitor);
                        }
                        return visitor.list(array, group, elementResult);
                    } finally {
                        visitor.fieldNames.pop();
                    }
                case MAP:
                    // expected shape: a non-repeated group holding exactly one repeated group,
                    // which in turn holds at most two fields (key and value)
                    Preconditions.checkArgument(!group.isRepetition(Repetition.REPEATED), "Invalid map: top-level group is repeated: %s", group);
                    Preconditions.checkArgument(group.getFieldCount() == 1, "Invalid map: does not contain single repeated field: %s", group);
                    GroupType repeatedKeyValue = group.getType(0).asGroupType();
                    Preconditions.checkArgument(repeatedKeyValue.isRepetition(Repetition.REPEATED), "Invalid map: inner group is not repeated");
                    Preconditions.checkArgument(repeatedKeyValue.getFieldCount() <= 2, "Invalid map: repeated group does not have 2 fields");
                    Preconditions.checkArgument(sType instanceof MapType, "Invalid map: %s is not a map", sType);
                    MapType map = (MapType) sType;
                    // synthetic fields: the key is always non-nullable, the value follows the map's setting
                    StructField keyField = new StructField("key", map.keyType(), false, Metadata.empty());
                    StructField valueField = new StructField("value", map.valueType(), map.valueContainsNull(), Metadata.empty());
                    visitor.fieldNames.push(repeatedKeyValue.getName());
                    try {
                        T keyResult = null;
                        T valueResult = null;
                        switch(repeatedKeyValue.getFieldCount()) {
                            case 2:
                                // if there are 2 fields, both key and value are projected
                                keyResult = visitField(keyField, repeatedKeyValue.getType(0), visitor);
                                valueResult = visitField(valueField, repeatedKeyValue.getType(1), visitor);
                                break;
                            case 1:
                                // if there is just one, use the name to determine what it is
                                Type keyOrValue = repeatedKeyValue.getType(0);
                                if (keyOrValue.getName().equalsIgnoreCase("key")) {
                                    keyResult = visitField(keyField, keyOrValue, visitor);
                                // value result remains null
                                } else {
                                    valueResult = visitField(valueField, keyOrValue, visitor);
                                // key result remains null
                                }
                                break;
                            default:
                                // no fields: nothing is visited and both results remain null
                        }
                        return visitor.map(map, group, keyResult, valueResult);
                    } finally {
                        visitor.fieldNames.pop();
                    }
                default:
                    // any other annotation is handled as a plain struct below
            }
        }
        // un-annotated groups, and groups with an annotation other than LIST or MAP
        Preconditions.checkArgument(sType instanceof StructType, "Invalid struct: %s is not a struct", sType);
        StructType struct = (StructType) sType;
        return visitor.struct(struct, group, visitFields(struct, group, visitor));
    }
}
Also used : ArrayType(org.apache.spark.sql.types.ArrayType) OriginalType(org.apache.parquet.schema.OriginalType) PrimitiveType(org.apache.parquet.schema.PrimitiveType) DataType(org.apache.spark.sql.types.DataType) StructType(org.apache.spark.sql.types.StructType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) MapType(org.apache.spark.sql.types.MapType) StructField(org.apache.spark.sql.types.StructField)

Aggregations

StructType (org.apache.spark.sql.types.StructType)418 StructField (org.apache.spark.sql.types.StructField)228 Row (org.apache.spark.sql.Row)200 ArrayList (java.util.ArrayList)152 Test (org.junit.Test)131 Script (org.apache.sysml.api.mlcontext.Script)68 SparkSession (org.apache.spark.sql.SparkSession)61 List (java.util.List)41 DataType (org.apache.spark.sql.types.DataType)40 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)36 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)34 DenseVector (org.apache.spark.ml.linalg.DenseVector)33 Map (java.util.Map)31 ArrayType (org.apache.spark.sql.types.ArrayType)30 Dataset (org.apache.spark.sql.Dataset)28 Tuple2 (scala.Tuple2)28 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)27 Vector (org.apache.spark.ml.linalg.Vector)27 IOException (java.io.IOException)26 InternalRow (org.apache.spark.sql.catalyst.InternalRow)25