Search in sources:

Example 1 with GenericInternalRow

Use of org.apache.spark.sql.catalyst.expressions.GenericInternalRow in the Apache Iceberg project.

From the class TestSparkPartitioningWriters, method toRow.

/**
 * Builds a two-field Spark internal row from the supplied values.
 *
 * @param id value placed at ordinal 0
 * @param data string placed at ordinal 1 after conversion with {@code UTF8String.fromString}
 * @return a {@code GenericInternalRow} holding both fields
 */
@Override
protected InternalRow toRow(Integer id, String data) {
    // Populate the backing array up front instead of updating ordinals one by one.
    Object[] fields = new Object[] { id, UTF8String.fromString(data) };
    return new GenericInternalRow(fields);
}
Also used : GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)

Example 2 with GenericInternalRow

Use of org.apache.spark.sql.catalyst.expressions.GenericInternalRow in the Apache Iceberg project.

From the class TestSparkPositionDeltaWriters, method toRow.

/**
 * Converts an id/data pair into a two-column {@code InternalRow}.
 *
 * @param id value written to column 0
 * @param data string written to column 1 as a {@code UTF8String}
 * @return the populated row
 */
@Override
protected InternalRow toRow(Integer id, String data) {
    // Convert the string first so the row assembly below reads as plain assignments.
    final UTF8String encodedData = UTF8String.fromString(data);
    final InternalRow result = new GenericInternalRow(2);
    result.update(0, id);
    result.update(1, encodedData);
    return result;
}
Also used : GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)

Example 3 with GenericInternalRow

Use of org.apache.spark.sql.catalyst.expressions.GenericInternalRow in the Apache Iceberg project.

From the class TestSparkRollingFileWriters, method toRow.

/**
 * Creates the Spark internal row used for an (id, data) record.
 *
 * @param id stored in the first field (ordinal 0)
 * @param data stored in the second field (ordinal 1) via {@code UTF8String.fromString}
 * @return a new {@code GenericInternalRow} with exactly two fields
 */
@Override
protected InternalRow toRow(Integer id, String data) {
    return new GenericInternalRow(new Object[] { id, UTF8String.fromString(data) });
}
Also used : GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)

Example 4 with GenericInternalRow

Use of org.apache.spark.sql.catalyst.expressions.GenericInternalRow in the Apache Iceberg project.

From the class TestPartitionPruning, method createTestDataset.

/**
 * Builds the test {@code Dataset<Row>} from the entries in {@code LOGS}.
 *
 * <p>Each entry becomes a {@code GenericInternalRow} of five fields (id, date, level, message,
 * timestamp in microseconds). The rows are parallelized into an RDD, turned into a DataFrame
 * using the Spark schema converted from {@code LOG_SCHEMA}, and then projected twice: first onto
 * the five base columns, then onto those columns plus three derived ones.
 *
 * @return the projected dataset including {@code bucket_id}, {@code truncated_message} and
 *     {@code ts_hour}
 */
private Dataset<Row> createTestDataset() {
    List<InternalRow> internalRows = LOGS.stream()
        .map(entry -> new GenericInternalRow(new Object[] {
            entry.getId(),
            UTF8String.fromString(entry.getDate()),
            UTF8String.fromString(entry.getLevel()),
            UTF8String.fromString(entry.getMessage()),
            // discard the nanoseconds part to simplify
            TimeUnit.MILLISECONDS.toMicros(entry.getTimestamp().toEpochMilli())
        }))
        .collect(Collectors.toList());

    JavaRDD<InternalRow> rowRdd = sparkContext.parallelize(internalRows);
    Dataset<Row> rawFrame =
        spark.internalCreateDataFrame(JavaRDD.toRDD(rowRdd), SparkSchemaUtil.convert(LOG_SCHEMA), false);

    // First projection: the five base columns only.
    Dataset<Row> baseColumns = rawFrame.selectExpr("id", "date", "level", "message", "timestamp");

    // Second projection: base columns plus the three derived columns.
    return baseColumns.selectExpr(
        "id",
        "date",
        "level",
        "message",
        "timestamp",
        "bucket3(id) AS bucket_id",
        "truncate5(message) AS truncated_message",
        "hour(timestamp) AS ts_hour");
}
Also used : Arrays(java.util.Arrays) Types(org.apache.iceberg.types.Types) Random(java.util.Random) SparkReadOptions(org.apache.iceberg.spark.SparkReadOptions) Transform(org.apache.iceberg.transforms.Transform) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) UTF8String(org.apache.spark.unsafe.types.UTF8String) Path(org.apache.hadoop.fs.Path) URI(java.net.URI) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) Parameterized(org.junit.runners.Parameterized) Literal(org.apache.iceberg.expressions.Literal) DataTypes(org.apache.spark.sql.types.DataTypes) Transforms(org.apache.iceberg.transforms.Transforms) AfterClass(org.junit.AfterClass) Predicate(java.util.function.Predicate) Timestamp(java.sql.Timestamp) HadoopTables(org.apache.iceberg.hadoop.HadoopTables) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Instant(java.time.Instant) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) Sets(org.apache.iceberg.relocated.com.google.common.collect.Sets) List(java.util.List) PartitionSpec(org.apache.iceberg.PartitionSpec) TableProperties(org.apache.iceberg.TableProperties) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Dataset(org.apache.spark.sql.Dataset) BeforeClass(org.junit.BeforeClass) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) RunWith(org.junit.runner.RunWith) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) Table(org.apache.iceberg.Table) IOException(java.io.IOException) Test(org.junit.Test) Row(org.apache.spark.sql.Row) SparkSchemaUtil(org.apache.iceberg.spark.SparkSchemaUtil) File(java.io.File) 
TimeUnit(java.util.concurrent.TimeUnit) Rule(org.junit.Rule) Assert(org.junit.Assert) TemporaryFolder(org.junit.rules.TemporaryFolder) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) Row(org.apache.spark.sql.Row) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)

Example 5 with GenericInternalRow

Use of org.apache.spark.sql.catalyst.expressions.GenericInternalRow in the Apache Iceberg project.

From the class TestSparkFileWriterFactory, method toRow.

/**
 * Wraps an id and its data string in a two-field {@code InternalRow}.
 *
 * @param id value set at ordinal 0
 * @param data string set at ordinal 1, converted with {@code UTF8String.fromString}
 * @return the newly created row
 */
@Override
protected InternalRow toRow(Integer id, String data) {
    final InternalRow record = new GenericInternalRow(2);
    // Ordinal 0 carries the id.
    record.update(0, id);
    // Ordinal 1 carries the data as a UTF8String.
    final UTF8String dataValue = UTF8String.fromString(data);
    record.update(1, dataValue);
    return record;
}
Also used : GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)

Aggregations

GenericInternalRow (org.apache.spark.sql.catalyst.expressions.GenericInternalRow): 25; InternalRow (org.apache.spark.sql.catalyst.InternalRow): 19; StructType (org.apache.spark.sql.types.StructType): 10; Test (org.junit.Test): 9; GenericData (org.apache.avro.generic.GenericData): 7; Schema (org.apache.avro.Schema): 6; Test (org.junit.jupiter.api.Test): 4; ProtoRows (com.google.cloud.bigquery.storage.v1.ProtoRows): 2; ProtobufUtils.toProtoRows (com.google.cloud.spark.bigquery.ProtobufUtils.toProtoRows): 2; List (java.util.List): 2; FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 2; Path (org.apache.hadoop.fs.Path): 2; SparkSession (org.apache.spark.sql.SparkSession): 2; StructField (org.apache.spark.sql.types.StructField): 2; OpenLineage (io.openlineage.client.OpenLineage): 1; SparkAgentTestExtension (io.openlineage.spark.agent.SparkAgentTestExtension): 1; OpenLineageClient (io.openlineage.spark.agent.client.OpenLineageClient): 1; ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils): 1; DatasetFactory (io.openlineage.spark.api.DatasetFactory): 1; File (java.io.File): 1