
Example 16 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.

In class TestSparkOrcReader, the method writeAndValidateRepeatingRecords:

@Test
public void writeAndValidateRepeatingRecords() throws IOException {
    Schema structSchema = new Schema(required(100, "id", Types.LongType.get()), required(101, "data", Types.StringType.get()));
    List<InternalRow> expectedRepeating = Collections.nCopies(100, RandomData.generateSpark(structSchema, 1, 0L).iterator().next());
    writeAndValidateRecords(structSchema, expectedRepeating);
}
Also used : Schema(org.apache.iceberg.Schema) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Test(org.junit.Test)
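
Note that Collections.nCopies returns 100 references to the same InternalRow instance, which is exactly the repeated-row case this test exercises. The writeAndValidateRecords helper is not shown on this page; below is a minimal sketch of what it might do, assuming it round-trips the rows through ORC with the SparkOrcWriter and SparkOrcReader seen in the later examples (the helper body and the temp rule usage are assumptions, not the test's actual code):

private void writeAndValidateRecords(Schema schema, List<InternalRow> expected) throws IOException {
    File testFile = temp.newFile();
    Assert.assertTrue("Delete should succeed", testFile.delete());
    // Write the Spark rows to an ORC file with the Spark ORC writer.
    try (FileAppender<InternalRow> writer = ORC.write(Files.localOutput(testFile))
            .createWriterFunc(SparkOrcWriter::new)
            .schema(schema)
            .build()) {
        writer.addAll(expected);
    }
    // Read the rows back with the Spark ORC reader and compare them to the input.
    try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(testFile))
            .project(schema)
            .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
            .build()) {
        Iterator<InternalRow> actual = reader.iterator();
        for (InternalRow expectedRow : expected) {
            Assert.assertTrue("Should have expected number of rows", actual.hasNext());
            TestHelpers.assertEquals(schema, expectedRow, actual.next());
        }
        Assert.assertFalse("Should not have extra rows", actual.hasNext());
    }
}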

Example 17 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.

In class TestSparkParquetReadMetadataColumns, the method testReadRowNumbersWithDelete:

@Test
public void testReadRowNumbersWithDelete() throws IOException {
    if (vectorized) {
        List<InternalRow> expectedRowsAfterDelete = Lists.newArrayList(EXPECTED_ROWS);
        // remove the rows at positions 98, 99, 100, 101, and 102; this range crosses the two row groups [0, 100) and [100, 200)
        for (int i = 1; i <= 5; i++) {
            expectedRowsAfterDelete.remove(98);
        }
        Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA);
        DeleteFilter deleteFilter = mock(DeleteFilter.class);
        when(deleteFilter.hasPosDeletes()).thenReturn(true);
        PositionDeleteIndex deletedRowPos = new CustomizedPositionDeleteIndex();
        deletedRowPos.delete(98, 103);
        when(deleteFilter.deletedRowPositions()).thenReturn(deletedRowPos);
        builder.createBatchedReaderFunc(fileSchema ->
            VectorizedSparkParquetReaders.buildReader(
                PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED,
                Maps.newHashMap(), deleteFilter));
        builder.recordsPerBatch(RECORDS_PER_BATCH);
        validate(expectedRowsAfterDelete, builder);
    }
}
Also used : Parquet(org.apache.iceberg.parquet.Parquet) PositionDeleteIndex(org.apache.iceberg.deletes.PositionDeleteIndex) DeleteFilter(org.apache.iceberg.data.DeleteFilter) InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) Test(org.junit.Test)
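
CustomizedPositionDeleteIndex is a test-local class whose body is not included on this page. A minimal stand-in, assuming the org.apache.iceberg.deletes.PositionDeleteIndex interface in this Iceberg version exposes delete(long), delete(long, long), isDeleted(long), and isEmpty(), could simply track positions in a sorted set:

// Hypothetical minimal stand-in for the test's CustomizedPositionDeleteIndex.
private static class CustomizedPositionDeleteIndex implements PositionDeleteIndex {
    private final java.util.NavigableSet<Long> deletedPositions = new java.util.TreeSet<>();

    @Override
    public void delete(long position) {
        deletedPositions.add(position);
    }

    @Override
    public void delete(long posStart, long posEnd) {
        // The end position is exclusive: delete(98, 103) marks positions 98..102 as deleted.
        for (long pos = posStart; pos < posEnd; pos++) {
            deletedPositions.add(pos);
        }
    }

    @Override
    public boolean isDeleted(long position) {
        return deletedPositions.contains(position);
    }

    @Override
    public boolean isEmpty() {
        return deletedPositions.isEmpty();
    }
}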

Example 18 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.

In class TestSparkParquetReadMetadataColumns, the method validate:

private void validate(List<InternalRow> expected, Parquet.ReadBuilder builder) throws IOException {
    try (CloseableIterable<InternalRow> reader = vectorized ? batchesToRows(builder.build()) : builder.build()) {
        final Iterator<InternalRow> actualRows = reader.iterator();
        for (InternalRow internalRow : expected) {
            Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
            TestHelpers.assertEquals(PROJECTION_SCHEMA, internalRow, actualRows.next());
        }
        Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
    }
}
Also used : InternalRow(org.apache.spark.sql.catalyst.InternalRow) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow)
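
The batchesToRows helper used in the vectorized branch is not shown here; it adapts a CloseableIterable of Spark ColumnarBatch into a CloseableIterable<InternalRow>. One plausible way to write it, assuming Iceberg's CloseableIterable.combine, the relocated Guava Iterables, and Spark's ColumnarBatch.rowIterator() (this body is an assumption, not the test's actual code):

private CloseableIterable<InternalRow> batchesToRows(CloseableIterable<ColumnarBatch> batches) {
    // Flatten every batch into its rows while keeping the batch iterable as the resource to close.
    return CloseableIterable.combine(
        Iterables.concat(Iterables.transform(batches, batch -> (Iterable<InternalRow>) batch::rowIterator)),
        batches);
}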

Example 19 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.

In class TestSparkParquetReader, the method testInt96TimestampProducedBySparkIsReadCorrectly:

@Test
public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException {
    String outputFilePath = String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet");
    HadoopOutputFile outputFile = HadoopOutputFile.fromPath(new org.apache.hadoop.fs.Path(outputFilePath), new Configuration());
    Schema schema = new Schema(required(1, "ts", Types.TimestampType.withZone()));
    StructType sparkSchema = new StructType(new StructField[] { new StructField("ts", DataTypes.TimestampType, true, Metadata.empty()) });
    List<InternalRow> rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L));
    try (FileAppender<InternalRow> writer = new ParquetWriteAdapter<>(
            new NativeSparkWriterBuilder(outputFile)
                .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json())
                .set("spark.sql.parquet.writeLegacyFormat", "false")
                .set("spark.sql.parquet.outputTimestampType", "INT96")
                .build(),
            MetricsConfig.getDefault())) {
        writer.addAll(rows);
    }
    InputFile parquetInputFile = Files.localInput(outputFilePath);
    List<InternalRow> readRows = rowsFromFile(parquetInputFile, schema);
    Assert.assertEquals(rows.size(), readRows.size());
    Assert.assertThat(readRows, CoreMatchers.is(rows));
    // Now we try to import that file as an Iceberg table to make sure Iceberg can read
    // Int96 end to end.
    Table int96Table = tableFromInputFile(parquetInputFile, schema);
    List<Record> tableRecords = Lists.newArrayList(IcebergGenerics.read(int96Table).build());
    Assert.assertEquals(rows.size(), tableRecords.size());
    for (int i = 0; i < tableRecords.size(); i++) {
        GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i));
    }
}
Also used : Table(org.apache.iceberg.Table) Configuration(org.apache.hadoop.conf.Configuration) StructType(org.apache.spark.sql.types.StructType) Schema(org.apache.iceberg.Schema) ParquetWriteAdapter(org.apache.iceberg.parquet.ParquetWriteAdapter) HadoopOutputFile(org.apache.parquet.hadoop.util.HadoopOutputFile) InputFile(org.apache.iceberg.io.InputFile) StructField(org.apache.spark.sql.types.StructField) Record(org.apache.iceberg.data.Record) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Test(org.junit.Test)
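
The rowsFromFile and tableFromInputFile helpers are not shown on this page. A sketch of rowsFromFile, assuming the non-vectorized Spark Parquet reader is built with SparkParquetReaders.buildReader (the helper body is an assumption, not the test's actual code):

private List<InternalRow> rowsFromFile(InputFile inputFile, Schema schema) throws IOException {
    // Read the Parquet file back into Spark InternalRows with Iceberg's row-based Spark reader.
    try (CloseableIterable<InternalRow> reader = Parquet.read(inputFile)
            .project(schema)
            .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(schema, fileSchema))
            .build()) {
        return Lists.newArrayList(reader);
    }
}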

Example 20 with InternalRow

Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.

In class TestSparkRecordOrcReaderWriter, the method writeAndValidate:

private void writeAndValidate(Schema schema, List<Record> expectedRecords) throws IOException {
    final File originalFile = temp.newFile();
    Assert.assertTrue("Delete should succeed", originalFile.delete());
    // Write a few generic records into the original test file.
    try (FileAppender<Record> writer = ORC.write(Files.localOutput(originalFile))
            .createWriterFunc(GenericOrcWriter::buildWriter)
            .schema(schema)
            .build()) {
        writer.addAll(expectedRecords);
    }
    // Read the original test file back into Spark InternalRows.
    List<InternalRow> internalRows = Lists.newArrayList();
    try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(originalFile))
            .project(schema)
            .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
            .build()) {
        reader.forEach(internalRows::add);
        assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size());
    }
    final File anotherFile = temp.newFile();
    Assert.assertTrue("Delete should succeed", anotherFile.delete());
    // Write those Spark InternalRows into a new file.
    try (FileAppender<InternalRow> writer = ORC.write(Files.localOutput(anotherFile))
            .createWriterFunc(SparkOrcWriter::new)
            .schema(schema)
            .build()) {
        writer.addAll(internalRows);
    }
    // Check that the round-tripped InternalRows match the expected records.
    try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(anotherFile))
            .project(schema)
            .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
            .build()) {
        assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size());
    }
    // Read into Iceberg GenericRecords and check again.
    try (CloseableIterable<Record> reader = ORC.read(Files.localInput(anotherFile))
            .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc))
            .project(schema)
            .build()) {
        assertRecordEquals(expectedRecords, reader, expectedRecords.size());
    }
}
Also used : GenericOrcReader(org.apache.iceberg.data.orc.GenericOrcReader) InternalRow(org.apache.spark.sql.catalyst.InternalRow) Types(org.apache.iceberg.types.Types) Iterator(java.util.Iterator) CloseableIterable(org.apache.iceberg.io.CloseableIterable) IOException(java.io.IOException) Test(org.junit.Test) Schema(org.apache.iceberg.Schema) GenericOrcWriter(org.apache.iceberg.data.orc.GenericOrcWriter) ORC(org.apache.iceberg.orc.ORC) File(java.io.File) RandomGenericData(org.apache.iceberg.data.RandomGenericData) BigDecimal(java.math.BigDecimal) List(java.util.List) Record(org.apache.iceberg.data.Record) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) NestedField.required(org.apache.iceberg.types.Types.NestedField.required) GenericRecord(org.apache.iceberg.data.GenericRecord) Assert(org.junit.Assert) Files(org.apache.iceberg.Files) FileAppender(org.apache.iceberg.io.FileAppender) GenericOrcWriter(org.apache.iceberg.data.orc.GenericOrcWriter) Record(org.apache.iceberg.data.Record) GenericRecord(org.apache.iceberg.data.GenericRecord) File(java.io.File) InternalRow(org.apache.spark.sql.catalyst.InternalRow)
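
The imports above (RandomGenericData, Types.NestedField.required, Test) suggest how writeAndValidate is driven. Below is a representative caller, sketched with a hypothetical test name and schema rather than the class's actual test bodies, which generates random generic records and hands them to the round-trip check:

@Test
public void testRoundTripWithRandomRecords() throws IOException {
    Schema schema = new Schema(
        required(1, "id", Types.LongType.get()),
        required(2, "data", Types.StringType.get()));
    // RandomGenericData produces Iceberg generic Records that conform to the schema.
    List<Record> expectedRecords = RandomGenericData.generate(schema, 100, 0L);
    writeAndValidate(schema, expectedRecords);
}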

Aggregations

InternalRow (org.apache.spark.sql.catalyst.InternalRow): 110 usages
GenericInternalRow (org.apache.spark.sql.catalyst.expressions.GenericInternalRow): 33 usages
Row (org.apache.spark.sql.Row): 30 usages
StructType (org.apache.spark.sql.types.StructType): 29 usages
Test (org.junit.Test): 28 usages
Schema (org.apache.iceberg.Schema): 17 usages
ArrayList (java.util.ArrayList): 16 usages
List (java.util.List): 16 usages
Test (org.junit.jupiter.api.Test): 14 usages
File (java.io.File): 13 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 13 usages
IOException (java.io.IOException): 12 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 12 usages
Types (org.apache.iceberg.types.Types): 12 usages
OutputFileFactory (org.apache.iceberg.io.OutputFileFactory): 11 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 10 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 10 usages
FileAppender (org.apache.iceberg.io.FileAppender): 10 usages
Map (java.util.Map): 9 usages
Assert (org.junit.Assert): 9 usages