Example 11 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class TestSparkSchema, method testSparkReadSchemaIsHonored.

@Test
public void testSparkReadSchemaIsHonored() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    StructType sparkReadSchema = new StructType(new StructField[] {
        new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
    });
    Dataset<Row> resultDf = spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation);
    Row[] results = (Row[]) resultDf.collect();
    Assert.assertEquals("Result size matches", 1, results.length);
    Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
    Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Row(org.apache.spark.sql.Row) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
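
A more compact way to build the same single-column read schema is StructType's fluent add(name, dataType, nullable) overload, which supplies Metadata.empty() implicitly. A minimal sketch, reusing the spark session from the test; the table path below is a placeholder, not taken from the test:

// Fluent equivalent of new StructType(new StructField[] { ... }).
StructType sparkReadSchema = new StructType()
    .add("id", DataTypes.IntegerType, true);
// "/tmp/iceberg-table" is a placeholder path for illustration only.
Dataset<Row> df = spark.read()
    .schema(sparkReadSchema)
    .format("iceberg")
    .load("/tmp/iceberg-table");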

Example 12 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class TestSparkSchema, method testSparkReadSchemaCombinedWithProjection.

@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    StructType sparkReadSchema = new StructType(new StructField[] {
        new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
        new StructField("data", DataTypes.StringType, true, Metadata.empty())
    });
    Dataset<Row> resultDf = spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id");
    Row[] results = (Row[]) resultDf.collect();
    Assert.assertEquals("Result size matches", 1, results.length);
    Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
    Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Row(org.apache.spark.sql.Row) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
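
A quick way to see how the read schema and the projection interact is to inspect the schema of the projected Dataset: sparkReadSchema declares two columns, but the trailing select("id") prunes the result to one. A minimal sketch of such a follow-up check, hypothetically reusing resultDf from the test above:

// Hypothetical extra assertions: the projection wins over the wider read schema.
StructType resultSchema = resultDf.schema();
Assert.assertEquals("Projected schema has a single field", 1, resultSchema.fields().length);
Assert.assertEquals("Remaining field is id", "id", resultSchema.fields()[0].name());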

Example 13 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class TestSparkSchema, method testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection.

@Test
public void testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() throws IOException {
    String tableLocation = temp.newFolder("iceberg-table").toString();
    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.unpartitioned();
    tables.create(SCHEMA, spec, null, tableLocation);
    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
    Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
    originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
    StructType sparkReadSchema = new StructType(new StructField[] {
        new StructField("data", DataTypes.StringType, true, Metadata.empty())
    });
    AssertHelpers.assertThrows(
        "Spark should not allow a projection that is not included in the read schema",
        org.apache.spark.sql.AnalysisException.class,
        "cannot resolve '`id`' given input columns: [data]",
        () -> spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id"));
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) Row(org.apache.spark.sql.Row) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
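
AssertHelpers is an Iceberg test utility rather than part of JUnit, so the same negative check can also be written with a plain try/catch where that helper is not available. A minimal sketch under that assumption:

// Equivalent check without Iceberg's AssertHelpers.
try {
    spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id");
    Assert.fail("Expected an AnalysisException for a column missing from the read schema");
} catch (org.apache.spark.sql.AnalysisException e) {
    Assert.assertTrue("Message should name the unresolved column",
        e.getMessage().contains("cannot resolve"));
}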

Example 14 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class Spark3Util, method getPartitions.

/**
 * Use Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the root path of the table's data files
 * @param format the format of the data files
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
    FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
    Map<String, String> emptyMap = Collections.emptyMap();
    InMemoryFileIndex fileIndex = new InMemoryFileIndex(
        spark,
        JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
        JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()),
        Option.empty(),
        fileStatusCache,
        Option.empty(),
        Option.empty());
    org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
    StructType schema = spec.partitionColumns();
    return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream().map(partition -> {
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
            int fieldIndex = schema.fieldIndex(field.name());
            Object catalystValue = partition.values().get(fieldIndex, field.dataType());
            Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
            values.put(field.name(), String.valueOf(value));
        });
        return new SparkPartition(values, partition.path().toString(), format);
    }).collect(Collectors.toList());
}
Also used : FileStatusCache(org.apache.spark.sql.execution.datasources.FileStatusCache) WRITE_DISTRIBUTION_MODE_RANGE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE) Distributions(org.apache.spark.sql.connector.iceberg.distributions.Distributions) Arrays(java.util.Arrays) DataSourceV2Relation(org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation) TypeUtil(org.apache.iceberg.types.TypeUtil) Types(org.apache.iceberg.types.Types) MetadataTableUtils(org.apache.iceberg.MetadataTableUtils) UpdateSchema(org.apache.iceberg.UpdateSchema) PartitionSpecVisitor(org.apache.iceberg.transforms.PartitionSpecVisitor) ByteBuffer(java.nio.ByteBuffer) TableOperations(org.apache.iceberg.TableOperations) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) SortOrder(org.apache.spark.sql.connector.iceberg.expressions.SortOrder) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) StructType(org.apache.spark.sql.types.StructType) Some(scala.Some) Term(org.apache.iceberg.expressions.Term) IntegerType(org.apache.spark.sql.types.IntegerType) Seq(scala.collection.Seq) SortOrderVisitor(org.apache.iceberg.transforms.SortOrderVisitor) Set(java.util.Set) LongType(org.apache.spark.sql.types.LongType) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) WRITE_DISTRIBUTION_MODE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) Type(org.apache.iceberg.types.Type) List(java.util.List) UpdateProperties(org.apache.iceberg.UpdateProperties) ExpressionVisitors(org.apache.iceberg.expressions.ExpressionVisitors) OrderedDistribution(org.apache.spark.sql.connector.iceberg.distributions.OrderedDistribution) Expressions(org.apache.spark.sql.connector.expressions.Expressions) DistributionMode(org.apache.iceberg.DistributionMode) PartitionSpec(org.apache.iceberg.PartitionSpec) JavaConverters(scala.collection.JavaConverters) TableProperties(org.apache.iceberg.TableProperties) Transform(org.apache.spark.sql.connector.expressions.Transform) ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Dataset(org.apache.spark.sql.Dataset) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) TableChange(org.apache.spark.sql.connector.catalog.TableChange) Pair(org.apache.iceberg.util.Pair) SortOrderUtil(org.apache.iceberg.util.SortOrderUtil) ParseException(org.apache.spark.sql.catalyst.parser.ParseException) BoundPredicate(org.apache.iceberg.expressions.BoundPredicate) InMemoryFileIndex(org.apache.spark.sql.execution.datasources.InMemoryFileIndex) Predef(scala.Predef) SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException) NullOrder(org.apache.iceberg.NullOrder) Namespace(org.apache.iceberg.catalog.Namespace) SparkSession(org.apache.spark.sql.SparkSession) CatalystTypeConverters(org.apache.spark.sql.catalyst.CatalystTypeConverters) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Literal(org.apache.spark.sql.connector.expressions.Literal) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) MetadataTableType(org.apache.iceberg.MetadataTableType) Row(org.apache.spark.sql.Row) Option(scala.Option) Joiner(org.apache.iceberg.relocated.com.google.common.base.Joiner) Distribution(org.apache.spark.sql.connector.iceberg.distributions.Distribution) Expression(org.apache.spark.sql.connector.expressions.Expression) CatalogPlugin(org.apache.spark.sql.connector.catalog.CatalogPlugin) Preconditions(org.apache.iceberg.relocated.com.google.common.base.Preconditions) UnboundPredicate(org.apache.iceberg.expressions.UnboundPredicate) Identifier(org.apache.spark.sql.connector.catalog.Identifier) ParserInterface(org.apache.spark.sql.catalyst.parser.ParserInterface) WRITE_DISTRIBUTION_MODE_NONE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE) Collections(java.util.Collections) SparkTable(org.apache.iceberg.spark.source.SparkTable) CaseInsensitiveStringMap(org.apache.spark.sql.util.CaseInsensitiveStringMap) CatalogManager(org.apache.spark.sql.connector.catalog.CatalogManager) Table(org.apache.spark.sql.connector.catalog.Table)
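
For orientation, a minimal sketch of how getPartitions might be invoked; the directory and format below are assumed values, and the root is expected to contain Hive-style key=value partition folders:

// "/data/warehouse/events" and "parquet" are placeholders for illustration.
SparkSession session = SparkSession.builder().master("local").appName("partitions-sketch").getOrCreate();
List<SparkPartition> partitions =
    Spark3Util.getPartitions(session, new Path("/data/warehouse/events"), "parquet");
// Each SparkPartition pairs a partition's key/value map with its location and format.
partitions.forEach(partition -> System.out.println(partition));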

Example 15 with StructType$

Use of org.apache.spark.sql.types.StructType$ in project iceberg by apache.

From the class SparkTestTable, method schema.

@Override
public StructType schema() {
    StructType schema = super.schema();
    if (metadataColumnNames != null) {
        for (String columnName : metadataColumnNames) {
            Types.NestedField metadataColumn = MetadataColumns.metadataColumn(table(), columnName);
            schema = schema.add(columnName, SparkSchemaUtil.convert(metadataColumn.type()));
        }
    }
    return schema;
}
Also used : Types(org.apache.iceberg.types.Types) StructType(org.apache.spark.sql.types.StructType)
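
The override relies on StructType.add returning a new, wider struct rather than mutating the receiver, which is why schema is reassigned on each loop iteration. A minimal sketch of that behavior with plain Spark types; "_file" is just an illustrative column name here:

StructType base = new StructType().add("id", DataTypes.IntegerType, true);
// add(...) returns a copy; base itself is left unchanged.
StructType widened = base.add("_file", DataTypes.StringType, true);
System.out.println(base.fields().length);     // prints 1
System.out.println(widened.fields().length);  // prints 2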

Aggregations

StructType (org.apache.spark.sql.types.StructType): 418
StructField (org.apache.spark.sql.types.StructField): 228
Row (org.apache.spark.sql.Row): 200
ArrayList (java.util.ArrayList): 152
Test (org.junit.Test): 131
Script (org.apache.sysml.api.mlcontext.Script): 68
SparkSession (org.apache.spark.sql.SparkSession): 61
List (java.util.List): 41
DataType (org.apache.spark.sql.types.DataType): 40
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 36
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 34
DenseVector (org.apache.spark.ml.linalg.DenseVector): 33
Map (java.util.Map): 31
ArrayType (org.apache.spark.sql.types.ArrayType): 30
Dataset (org.apache.spark.sql.Dataset): 28
Tuple2 (scala.Tuple2): 28
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 27
Vector (org.apache.spark.ml.linalg.Vector): 27
IOException (java.io.IOException): 26
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 25