Example 26 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache, from the class TestSparkPositionDeltaWriters, method toSet.

@Override
protected StructLikeSet toSet(Iterable<InternalRow> rows) {
    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
    StructType sparkType = SparkSchemaUtil.convert(table.schema());
    for (InternalRow row : rows) {
        InternalRowWrapper wrapper = new InternalRowWrapper(sparkType);
        set.add(wrapper.wrap(row));
    }
    return set;
}
Also used : StructType(org.apache.spark.sql.types.StructType) StructLikeSet(org.apache.iceberg.util.StructLikeSet) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)
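
As a hedged illustration, not part of the Iceberg test itself: a StructLikeSet built this way is typically compared against a second set so the assertion ignores row order. The expectedRows and writtenRows names below are hypothetical placeholders for the test's expected and actual data.

// Minimal sketch with hypothetical inputs: order-insensitive comparison of rows.
StructLikeSet expected = toSet(expectedRows); // expectedRows: Iterable<InternalRow>, assumed
StructLikeSet actual = toSet(writtenRows);    // writtenRows: rows read back from the table, assumed
Assert.assertEquals("Written rows should match expected rows", expected, actual);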

Example 27 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache, from the class TestSparkRollingFileWriters, method toRow.

@Override
protected InternalRow toRow(Integer id, String data) {
    InternalRow row = new GenericInternalRow(2);
    row.update(0, id);
    row.update(1, UTF8String.fromString(data));
    return row;
}
Also used : GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) InternalRow(org.apache.spark.sql.catalyst.InternalRow)
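
A hedged sketch of how such a toRow helper is typically used to build fixtures; the ids and strings below are made up. Note that Catalyst's internal row format stores string columns as UTF8String, which is why toRow wraps the data value with UTF8String.fromString.

// Hypothetical fixture data built with the toRow helper above.
List<InternalRow> rows = Arrays.asList(
    toRow(1, "aaa"),
    toRow(2, "bbb"),
    toRow(3, "ccc"));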

Example 28 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache, from the class TestPartitionPruning, method createTestDataset.

private Dataset<Row> createTestDataset() {
    List<InternalRow> rows = LOGS.stream().map(logMessage -> {
        Object[] underlying = new Object[] {
            logMessage.getId(),
            UTF8String.fromString(logMessage.getDate()),
            UTF8String.fromString(logMessage.getLevel()),
            UTF8String.fromString(logMessage.getMessage()),
            // discard the nanoseconds part to simplify
            TimeUnit.MILLISECONDS.toMicros(logMessage.getTimestamp().toEpochMilli())
        };
        return new GenericInternalRow(underlying);
    }).collect(Collectors.toList());
    JavaRDD<InternalRow> rdd = sparkContext.parallelize(rows);
    Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(LOG_SCHEMA), false);
    return df.selectExpr("id", "date", "level", "message", "timestamp")
        .selectExpr("id", "date", "level", "message", "timestamp",
            "bucket3(id) AS bucket_id", "truncate5(message) AS truncated_message",
            "hour(timestamp) AS ts_hour");
}
Also used : Arrays(java.util.Arrays) Types(org.apache.iceberg.types.Types) Random(java.util.Random) SparkReadOptions(org.apache.iceberg.spark.SparkReadOptions) Transform(org.apache.iceberg.transforms.Transform) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) UTF8String(org.apache.spark.unsafe.types.UTF8String) Path(org.apache.hadoop.fs.Path) URI(java.net.URI) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Parameterized(org.junit.runners.Parameterized) Literal(org.apache.iceberg.expressions.Literal) DataTypes(org.apache.spark.sql.types.DataTypes) Transforms(org.apache.iceberg.transforms.Transforms) AfterClass(org.junit.AfterClass) Predicate(java.util.function.Predicate) Timestamp(java.sql.Timestamp) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Instant(java.time.Instant) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets) List(java.util.List) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Dataset(org.apache.spark.sql.Dataset) BeforeClass(org.junit.BeforeClass) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) RunWith(org.junit.runner.RunWith) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) Table(org.apache.iceberg.Table) IOException(java.io.IOException) Test(org.junit.Test) Row(org.apache.spark.sql.Row) SparkSchemaUtil(org.apache.iceberg.spark.SparkSchemaUtil) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) Rule(org.junit.Rule) Assert(org.junit.Assert) TemporaryFolder(org.junit.rules.TemporaryFolder)
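
Note that internalCreateDataFrame expects rows already in Catalyst form (UTF8String for strings, microseconds since the epoch for timestamps), and the bucket3 and truncate5 expressions in the final selectExpr rely on UDFs registered elsewhere in the test class, while hour is a built-in Spark SQL function. For contrast, a hedged sketch of the public-API route, under the same LOGS and LOG_SCHEMA assumptions as the test above: spark.createDataFrame accepts external Row objects (plain String, java.sql.Timestamp) and performs the internal conversion itself.

// Sketch only: an equivalent DataFrame via the public API, letting Spark do the conversion.
StructType sparkSchema = SparkSchemaUtil.convert(LOG_SCHEMA);
List<Row> externalRows = LOGS.stream()
    .map(log -> RowFactory.create(
        log.getId(), log.getDate(), log.getLevel(), log.getMessage(),
        Timestamp.from(log.getTimestamp())))
    .collect(Collectors.toList());
Dataset<Row> publicApiDf = spark.createDataFrame(externalRows, sparkSchema);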

Example 29 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache, from the class TestSparkAppenderFactory, method expectedRowSet.

@Override
protected StructLikeSet expectedRowSet(Iterable<InternalRow> rows) {
    StructLikeSet set = StructLikeSet.create(table.schema().asStruct());
    for (InternalRow row : rows) {
        InternalRowWrapper wrapper = new InternalRowWrapper(sparkType);
        set.add(wrapper.wrap(row));
    }
    return set;
}
Also used : StructLikeSet(org.apache.iceberg.util.StructLikeSet) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)

Example 30 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache, from the class TestSparkDataFile, method checkSparkDataFile.

private void checkSparkDataFile(Table table) throws IOException {
    Iterable<InternalRow> rows = RandomData.generateSpark(table.schema(), 200, 0);
    JavaRDD<InternalRow> rdd = sparkContext.parallelize(Lists.newArrayList(rows));
    Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), SparkSchemaUtil.convert(table.schema()), false);
    df.write().format("iceberg").mode("append").save(tableLocation);
    table.refresh();
    List<ManifestFile> manifests = table.currentSnapshot().allManifests();
    Assert.assertEquals("Should have 1 manifest", 1, manifests.size());
    List<DataFile> dataFiles = Lists.newArrayList();
    try (ManifestReader<DataFile> reader = ManifestFiles.read(manifests.get(0), table.io())) {
        for (DataFile dataFile : reader) {
            checkDataFile(dataFile.copy(), DataFiles.builder(table.spec()).copy(dataFile).build());
            dataFiles.add(dataFile.copy());
        }
    }
    Dataset<Row> dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files");
    // reorder columns to test arbitrary projections
    List<Column> columns = Arrays.stream(dataFileDF.columns()).map(ColumnName::new).collect(Collectors.toList());
    Collections.shuffle(columns);
    List<Row> sparkDataFiles = dataFileDF.select(Iterables.toArray(columns, Column.class)).collectAsList();
    Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size());
    Types.StructType dataFileType = DataFile.getType(table.spec().partitionType());
    StructType sparkDataFileType = sparkDataFiles.get(0).schema();
    SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkDataFileType);
    for (int i = 0; i < dataFiles.size(); i++) {
        checkDataFile(dataFiles.get(i), wrapper.wrap(sparkDataFiles.get(i)));
    }
}
Also used : Types(org.apache.iceberg.types.Types) SparkDataFile(org.apache.iceberg.spark.SparkDataFile) StructType(org.apache.spark.sql.types.StructType) ManifestFile(org.apache.iceberg.ManifestFile) DataFile(org.apache.iceberg.DataFile) Column(org.apache.spark.sql.Column) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Row(org.apache.spark.sql.Row)
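
A hedged aside on the "#files" suffix used above: it loads Iceberg's files metadata table, which exposes one row per data file (columns such as file_path, file_format, record_count, and the partition struct). A quick way to inspect it, sketched under the same tableLocation assumption as the test:

// Sketch: inspect the files metadata table directly.
Dataset<Row> filesDF = spark.read().format("iceberg").load(tableLocation + "#files");
filesDF.select("file_path", "file_format", "record_count").show(false);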

Aggregations

InternalRow (org.apache.spark.sql.catalyst.InternalRow): 110
GenericInternalRow (org.apache.spark.sql.catalyst.expressions.GenericInternalRow): 33
Row (org.apache.spark.sql.Row): 30
StructType (org.apache.spark.sql.types.StructType): 29
Test (org.junit.Test): 28
Schema (org.apache.iceberg.Schema): 17
ArrayList (java.util.ArrayList): 16
List (java.util.List): 16
Test (org.junit.jupiter.api.Test): 14
File (java.io.File): 13
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 13
IOException (java.io.IOException): 12
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 12
Types (org.apache.iceberg.types.Types): 12
OutputFileFactory (org.apache.iceberg.io.OutputFileFactory): 11
GenericRecord (org.apache.avro.generic.GenericRecord): 10
HoodieKey (org.apache.hudi.common.model.HoodieKey): 10
FileAppender (org.apache.iceberg.io.FileAppender): 10
Map (java.util.Map): 9
Assert (org.junit.Assert): 9