Example 11 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in the apache/iceberg project.

From the CherrypickSnapshotProcedure class, the call method:

@Override
public InternalRow[] call(InternalRow args) {
    Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
    long snapshotId = args.getLong(1);
    return modifyIcebergTable(tableIdent, table -> {
        table.manageSnapshots().cherrypick(snapshotId).commit();
        Snapshot currentSnapshot = table.currentSnapshot();
        InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId());
        return new InternalRow[] { outputRow };
    });
}
Also used : Snapshot(org.apache.iceberg.Snapshot) Identifier(org.apache.spark.sql.connector.catalog.Identifier) InternalRow(org.apache.spark.sql.catalyst.InternalRow)
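
The newInternalRow helper used above comes from the procedure's base class and is not shown here. A minimal sketch of how such a helper can be written, assuming the values are already in Spark's internal representation (GenericInternalRow here is org.apache.spark.sql.catalyst.expressions.GenericInternalRow):

static InternalRow newInternalRow(Object... values) {
    // GenericInternalRow wraps the given array directly; values must already be in
    // Spark's internal form (e.g. UTF8String for strings, primitive long for snapshot ids).
    return new GenericInternalRow(values);
}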

Example 12 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in the apache/iceberg project.

From the TestDataFileSerialization class, the testParquetWriterSplitOffsets method:

@Test
public void testParquetWriterSplitOffsets() throws IOException {
    Iterable<InternalRow> records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L);
    File parquetFile = new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
    FileAppender<InternalRow> writer = Parquet.write(Files.localOutput(parquetFile))
        .schema(DATE_SCHEMA)
        .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType))
        .build();
    try {
        writer.addAll(records);
    } finally {
        writer.close();
    }
    Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
    File dataFile = temp.newFile();
    try (Output out = new Output(new FileOutputStream(dataFile))) {
        kryo.writeClassAndObject(out, writer.splitOffsets());
    }
    try (Input in = new Input(new FileInputStream(dataFile))) {
        kryo.readClassAndObject(in);
    }
}
Also used : Output(com.esotericsoftware.kryo.io.Output) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Types(org.apache.iceberg.types.Types) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ObjectInputStream(java.io.ObjectInputStream) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) SparkParquetWriters(org.apache.iceberg.spark.data.SparkParquetWriters) Kryo(com.esotericsoftware.kryo.Kryo) ByteBuffer(java.nio.ByteBuffer) ByteArrayInputStream(java.io.ByteArrayInputStream) Map(java.util.Map) Input(com.esotericsoftware.kryo.io.Input) Assertions(org.assertj.core.api.Assertions) ObjectOutputStream(java.io.ObjectOutputStream) FileAppender(org.apache.iceberg.io.FileAppender) RandomData(org.apache.iceberg.spark.data.RandomData) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) SparkConf(org.apache.spark.SparkConf) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) Parquet(org.apache.iceberg.parquet.Parquet) Test(org.junit.Test) FileInputStream(java.io.FileInputStream) UUID(java.util.UUID) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) KryoSerializer(org.apache.spark.serializer.KryoSerializer) SparkSchemaUtil(org.apache.iceberg.spark.SparkSchemaUtil) File(java.io.File) ByteOrder(java.nio.ByteOrder) Rule(org.junit.Rule) NestedField.required(org.apache.iceberg.types.Types.NestedField.required) Assert(org.junit.Assert) TemporaryFolder(org.junit.rules.TemporaryFolder) TaskCheckHelper.assertEquals(org.apache.iceberg.TaskCheckHelper.assertEquals)
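
The test above only checks that the split offsets can be written and read back with Kryo without errors. A minimal sketch that also asserts the round-tripped value, assuming the same TemporaryFolder rule (temp) and that splitOffsets() returns a List<Long>:

List<Long> offsets = writer.splitOffsets();
File serialized = temp.newFile();
try (Output out = new Output(new FileOutputStream(serialized))) {
    kryo.writeClassAndObject(out, offsets);
}
try (Input in = new Input(new FileInputStream(serialized))) {
    @SuppressWarnings("unchecked")
    List<Long> roundTripped = (List<Long>) kryo.readClassAndObject(in);
    // The deserialized offsets should match what the Parquet writer reported.
    Assert.assertEquals(offsets, roundTripped);
}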

Example 13 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in the apache/iceberg project.

From the TestSparkAvroEnums class, the writeAndValidateEnums method:

@Test
public void writeAndValidateEnums() throws IOException {
    org.apache.avro.Schema avroSchema = SchemaBuilder.record("root")
        .fields()
        .name("enumCol")
        .type()
        .nullable()
        .enumeration("testEnum")
        .symbols("SYMB1", "SYMB2")
        .enumDefault("SYMB2")
        .endRecord();
    org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0);
    Record enumRecord1 = new GenericData.Record(avroSchema);
    enumRecord1.put("enumCol", new GenericData.EnumSymbol(enumSchema, "SYMB1"));
    Record enumRecord2 = new GenericData.Record(avroSchema);
    enumRecord2.put("enumCol", new GenericData.EnumSymbol(enumSchema, "SYMB2"));
    // null enum
    Record enumRecord3 = new GenericData.Record(avroSchema);
    List<Record> expected = ImmutableList.of(enumRecord1, enumRecord2, enumRecord3);
    File testFile = temp.newFile();
    Assert.assertTrue("Delete should succeed", testFile.delete());
    try (DataFileWriter<Record> writer = new DataFileWriter<>(new GenericDatumWriter<>())) {
        writer.create(avroSchema, testFile);
        writer.append(enumRecord1);
        writer.append(enumRecord2);
        writer.append(enumRecord3);
    }
    Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields());
    List<InternalRow> rows;
    try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
            .createReaderFunc(SparkAvroReader::new)
            .project(schema)
            .build()) {
        rows = Lists.newArrayList(reader);
    }
    // Iceberg will return enums as strings, so we compare string values for the enum field
    for (int i = 0; i < expected.size(); i += 1) {
        String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString();
        String sparkString = rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString();
        Assert.assertEquals(expectedEnumString, sparkString);
    }
}
Also used : DataFileWriter(org.apache.avro.file.DataFileWriter) Schema(org.apache.iceberg.Schema) GenericData(org.apache.avro.generic.GenericData) Record(org.apache.avro.generic.GenericData.Record) File(java.io.File) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Test(org.junit.Test)
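
Because Iceberg reads Avro enums back as strings, the enum column surfaces as a UTF8String on each InternalRow. An equivalent null-safe comparison can use isNullAt instead of testing getUTF8String for null (a minimal sketch, same assertions as above):

for (int i = 0; i < expected.size(); i += 1) {
    Object expectedEnum = expected.get(i).get("enumCol");
    InternalRow row = rows.get(i);
    if (expectedEnum == null) {
        // A null enum value should be read back as a null column.
        Assert.assertTrue("Enum column should be null", row.isNullAt(0));
    } else {
        Assert.assertEquals(expectedEnum.toString(), row.getUTF8String(0).toString());
    }
}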

Example 14 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in the apache/iceberg project.

From the TestSparkAvroReader class, the writeAndValidate method:

@Override
protected void writeAndValidate(Schema schema) throws IOException {
    List<Record> expected = RandomData.generateList(schema, 100, 0L);
    File testFile = temp.newFile();
    Assert.assertTrue("Delete should succeed", testFile.delete());
    try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile)).schema(schema).named("test").build()) {
        for (Record rec : expected) {
            writer.add(rec);
        }
    }
    List<InternalRow> rows;
    try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
            .createReaderFunc(SparkAvroReader::new)
            .project(schema)
            .build()) {
        rows = Lists.newArrayList(reader);
    }
    for (int i = 0; i < expected.size(); i += 1) {
        assertEqualsUnsafe(schema.asStruct(), expected.get(i), rows.get(i));
    }
}
Also used : Record(org.apache.avro.generic.GenericData.Record) File(java.io.File) InternalRow(org.apache.spark.sql.catalyst.InternalRow)
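
Since writeAndValidate is an @Override, the schemas it receives normally come from the test base class. A minimal sketch of how a concrete case could invoke it directly; the field ids, names, and types below are illustrative only, and Types refers to org.apache.iceberg.types.Types:

@Test
public void testSimpleSchema() throws IOException {
    // Hypothetical schema used only to illustrate driving writeAndValidate directly.
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));
    writeAndValidate(schema);
}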

Example 15 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in the apache/iceberg project.

From the TestSparkOrcReader class, the writeAndValidateRecords method:

private void writeAndValidateRecords(Schema schema, Iterable<InternalRow> expected) throws IOException {
    final File testFile = temp.newFile();
    Assert.assertTrue("Delete should succeed", testFile.delete());
    try (FileAppender<InternalRow> writer = ORC.write(Files.localOutput(testFile))
            .createWriterFunc(SparkOrcWriter::new)
            .schema(schema)
            .build()) {
        writer.addAll(expected);
    }
    try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(testFile))
            .project(schema)
            .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
            .build()) {
        final Iterator<InternalRow> actualRows = reader.iterator();
        final Iterator<InternalRow> expectedRows = expected.iterator();
        while (expectedRows.hasNext()) {
            Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
            assertEquals(schema, expectedRows.next(), actualRows.next());
        }
        Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
    }
    try (CloseableIterable<ColumnarBatch> reader = ORC.read(Files.localInput(testFile))
            .project(schema)
            .createBatchedReaderFunc(readOrcSchema -> VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of()))
            .build()) {
        final Iterator<InternalRow> actualRows = batchesToRows(reader.iterator());
        final Iterator<InternalRow> expectedRows = expected.iterator();
        while (expectedRows.hasNext()) {
            Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
            assertEquals(schema, expectedRows.next(), actualRows.next());
        }
        Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
    }
}
Also used : InternalRow(org.apache.spark.sql.catalyst.InternalRow) Types(org.apache.iceberg.types.Types) Iterator(java.util.Iterator) CloseableIterable(org.apache.iceberg.io.CloseableIterable) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) IOException(java.io.IOException) Test(org.junit.Test) Schema(org.apache.iceberg.Schema) ORC(org.apache.iceberg.orc.ORC) File(java.io.File) VectorizedSparkOrcReaders(org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders) List(java.util.List) ColumnarBatch(org.apache.spark.sql.vectorized.ColumnarBatch) NestedField.required(org.apache.iceberg.types.Types.NestedField.required) TestHelpers.assertEquals(org.apache.iceberg.spark.data.TestHelpers.assertEquals) Iterators(org.apache.iceberg.relocated.com.google.common.collect.Iterators) Assert(org.junit.Assert) Collections(java.util.Collections) Files(org.apache.iceberg.Files) FileAppender(org.apache.iceberg.io.FileAppender)
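
The batchesToRows helper used in the vectorized read path is not shown above. One way to write it is to flatten each ColumnarBatch into its row iterator; a minimal sketch, assuming the relocated Guava Iterators already listed in the imports:

private Iterator<InternalRow> batchesToRows(Iterator<ColumnarBatch> batches) {
    // Each ColumnarBatch exposes its rows as an Iterator<InternalRow>; concatenating
    // the per-batch iterators yields one flat row iterator over all batches.
    return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator));
}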

Aggregations

InternalRow (org.apache.spark.sql.catalyst.InternalRow): 110 usages
GenericInternalRow (org.apache.spark.sql.catalyst.expressions.GenericInternalRow): 33 usages
Row (org.apache.spark.sql.Row): 30 usages
StructType (org.apache.spark.sql.types.StructType): 29 usages
Test (org.junit.Test): 28 usages
Schema (org.apache.iceberg.Schema): 17 usages
ArrayList (java.util.ArrayList): 16 usages
List (java.util.List): 16 usages
Test (org.junit.jupiter.api.Test): 14 usages
File (java.io.File): 13 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 13 usages
IOException (java.io.IOException): 12 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 12 usages
Types (org.apache.iceberg.types.Types): 12 usages
OutputFileFactory (org.apache.iceberg.io.OutputFileFactory): 11 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 10 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 10 usages
FileAppender (org.apache.iceberg.io.FileAppender): 10 usages
Map (java.util.Map): 9 usages
Assert (org.junit.Assert): 9 usages