Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkOrcReader, method writeAndValidateRepeatingRecords.
@Test
public void writeAndValidateRepeatingRecords() throws IOException {
  Schema structSchema = new Schema(
      required(100, "id", Types.LongType.get()),
      required(101, "data", Types.StringType.get()));
  List<InternalRow> expectedRepeating = Collections.nCopies(100,
      RandomData.generateSpark(structSchema, 1, 0L).iterator().next());

  writeAndValidateRecords(structSchema, expectedRepeating);
}
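The writeAndValidateRecords helper is not part of this excerpt. A minimal sketch of what it could look like, assuming the same ORC round-trip pattern used by the other snippets in this section (SparkOrcWriter to write, SparkOrcReader to read back, and a JUnit TemporaryFolder named temp); the actual helper in TestSparkOrcReader may differ.

// Hypothetical sketch, not the verbatim helper from the Iceberg test sources.
private void writeAndValidateRecords(Schema schema, Iterable<InternalRow> expected) throws IOException {
  final File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  // Write the InternalRows with the Spark ORC writer.
  try (FileAppender<InternalRow> writer = ORC.write(Files.localOutput(testFile))
      .createWriterFunc(SparkOrcWriter::new)
      .schema(schema)
      .build()) {
    writer.addAll(expected);
  }

  // Read them back with the Spark ORC reader and compare row by row.
  try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(testFile))
      .project(schema)
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
      .build()) {
    final Iterator<InternalRow> actual = reader.iterator();
    for (InternalRow expectedRow : expected) {
      Assert.assertTrue("Should have expected number of rows", actual.hasNext());
      TestHelpers.assertEquals(schema, expectedRow, actual.next());
    }
    Assert.assertFalse("Should not have extra rows", actual.hasNext());
  }
}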
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkParquetReadMetadataColumns, method testReadRowNumbersWithDelete.
@Test
public void testReadRowNumbersWithDelete() throws IOException {
  if (vectorized) {
    List<InternalRow> expectedRowsAfterDelete = Lists.newArrayList(EXPECTED_ROWS);
    // Remove the rows at positions 98..102; this range crosses the two row groups [0, 100) and [100, 200).
    for (int i = 1; i <= 5; i++) {
      expectedRowsAfterDelete.remove(98);
    }

    Parquet.ReadBuilder builder = Parquet.read(Files.localInput(testFile)).project(PROJECTION_SCHEMA);

    DeleteFilter deleteFilter = mock(DeleteFilter.class);
    when(deleteFilter.hasPosDeletes()).thenReturn(true);
    PositionDeleteIndex deletedRowPos = new CustomizedPositionDeleteIndex();
    deletedRowPos.delete(98, 103);
    when(deleteFilter.deletedRowPositions()).thenReturn(deletedRowPos);

    builder.createBatchedReaderFunc(fileSchema ->
        VectorizedSparkParquetReaders.buildReader(
            PROJECTION_SCHEMA, fileSchema, NullCheckingForGet.NULL_CHECKING_ENABLED,
            Maps.newHashMap(), deleteFilter));
    builder.recordsPerBatch(RECORDS_PER_BATCH);

    validate(expectedRowsAfterDelete, builder);
  }
}
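The CustomizedPositionDeleteIndex used above is not shown in this excerpt. A minimal sketch, assuming it simply implements Iceberg's PositionDeleteIndex over an in-memory set of positions; the real test class may use a different backing structure or implement additional methods of the interface.

// Illustrative sketch only.
private static class CustomizedPositionDeleteIndex implements PositionDeleteIndex {
  private final Set<Long> deleteIndex = Sets.newHashSet();

  @Override
  public void delete(long position) {
    deleteIndex.add(position);
  }

  @Override
  public void delete(long posStart, long posEnd) {
    // Marks the half-open range [posStart, posEnd) as deleted, matching the delete(98, 103) call above.
    for (long position = posStart; position < posEnd; position++) {
      deleteIndex.add(position);
    }
  }

  @Override
  public boolean isDeleted(long position) {
    return deleteIndex.contains(position);
  }

  @Override
  public boolean isEmpty() {
    return deleteIndex.isEmpty();
  }
}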
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkParquetReadMetadataColumns, method validate.
private void validate(List<InternalRow> expected, Parquet.ReadBuilder builder) throws IOException {
  try (CloseableIterable<InternalRow> reader =
      vectorized ? batchesToRows(builder.build()) : builder.build()) {
    final Iterator<InternalRow> actualRows = reader.iterator();

    for (InternalRow internalRow : expected) {
      Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
      TestHelpers.assertEquals(PROJECTION_SCHEMA, internalRow, actualRows.next());
    }

    Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
  }
}
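For the vectorized path, validate() relies on a batchesToRows helper that is not shown here. One way to flatten the reader's ColumnarBatch stream into InternalRows is sketched below, assuming Guava's Iterables and Iceberg's CloseableIterable.combine; the helper in the test class may be implemented differently.

// Sketch: concatenate the row iterators of all batches, keeping the original
// iterable as the Closeable so the underlying file reader is still closed.
private CloseableIterable<InternalRow> batchesToRows(CloseableIterable<ColumnarBatch> batches) {
  return CloseableIterable.combine(
      Iterables.concat(Iterables.transform(batches, batch -> (Iterable<InternalRow>) batch::rowIterator)),
      batches);
}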
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkParquetReader, method testInt96TimestampProducedBySparkIsReadCorrectly.
@Test
public void testInt96TimestampProducedBySparkIsReadCorrectly() throws IOException {
  String outputFilePath = String.format("%s/%s", temp.getRoot().getAbsolutePath(), "parquet_int96.parquet");
  HadoopOutputFile outputFile = HadoopOutputFile.fromPath(
      new org.apache.hadoop.fs.Path(outputFilePath), new Configuration());
  Schema schema = new Schema(required(1, "ts", Types.TimestampType.withZone()));
  StructType sparkSchema = new StructType(new StructField[] {
      new StructField("ts", DataTypes.TimestampType, true, Metadata.empty())
  });
  List<InternalRow> rows = Lists.newArrayList(RandomData.generateSpark(schema, 10, 0L));

  try (FileAppender<InternalRow> writer = new ParquetWriteAdapter<>(
      new NativeSparkWriterBuilder(outputFile)
          .set("org.apache.spark.sql.parquet.row.attributes", sparkSchema.json())
          .set("spark.sql.parquet.writeLegacyFormat", "false")
          .set("spark.sql.parquet.outputTimestampType", "INT96")
          .build(),
      MetricsConfig.getDefault())) {
    writer.addAll(rows);
  }

  InputFile parquetInputFile = Files.localInput(outputFilePath);
  List<InternalRow> readRows = rowsFromFile(parquetInputFile, schema);
  Assert.assertEquals(rows.size(), readRows.size());
  Assert.assertThat(readRows, CoreMatchers.is(rows));

  // Now we try to import that file as an Iceberg table to make sure Iceberg can read
  // Int96 end to end.
  Table int96Table = tableFromInputFile(parquetInputFile, schema);
  List<Record> tableRecords = Lists.newArrayList(IcebergGenerics.read(int96Table).build());
  Assert.assertEquals(rows.size(), tableRecords.size());
  for (int i = 0; i < tableRecords.size(); i++) {
    GenericsHelpers.assertEqualsUnsafe(schema.asStruct(), tableRecords.get(i), rows.get(i));
  }
}
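The rowsFromFile and tableFromInputFile helpers are not part of this excerpt. A hedged sketch of rowsFromFile, assuming it reads the Parquet file back through Iceberg's non-vectorized Spark value reader (SparkParquetReaders); the actual helper in TestSparkParquetReader may differ.

// Sketch: read the whole file into memory using the Spark Parquet value reader.
private List<InternalRow> rowsFromFile(InputFile inputFile, Schema schema) throws IOException {
  try (CloseableIterable<InternalRow> reader = Parquet.read(inputFile)
      .project(schema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(schema, fileSchema))
      .build()) {
    return Lists.newArrayList(reader);
  }
}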
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkRecordOrcReaderWriter, method writeAndValidate.
private void writeAndValidate(Schema schema, List<Record> expectedRecords) throws IOException {
  final File originalFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", originalFile.delete());

  // Write a few generic records into the original test file.
  try (FileAppender<Record> writer = ORC.write(Files.localOutput(originalFile))
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .schema(schema)
      .build()) {
    writer.addAll(expectedRecords);
  }

  // Read the original test file into Spark InternalRows.
  List<InternalRow> internalRows = Lists.newArrayList();
  try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(originalFile))
      .project(schema)
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
      .build()) {
    reader.forEach(internalRows::add);
    assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size());
  }

  final File anotherFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", anotherFile.delete());

  // Write those Spark InternalRows into a new file.
  try (FileAppender<InternalRow> writer = ORC.write(Files.localOutput(anotherFile))
      .createWriterFunc(SparkOrcWriter::new)
      .schema(schema)
      .build()) {
    writer.addAll(internalRows);
  }

  // Check that the InternalRows still match the expected records.
  try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(anotherFile))
      .project(schema)
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
      .build()) {
    assertEqualsUnsafe(schema.asStruct(), expectedRecords, reader, expectedRecords.size());
  }

  // Read into Iceberg GenericRecords and check again.
  try (CloseableIterable<Record> reader = ORC.read(Files.localInput(anotherFile))
      .createReaderFunc(typeDesc -> GenericOrcReader.buildReader(schema, typeDesc))
      .project(schema)
      .build()) {
    assertRecordEquals(expectedRecords, reader, expectedRecords.size());
  }
}
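For context, a test in this class might drive writeAndValidate as shown below. The schema, record count, and seed are illustrative, and the use of RandomGenericData to generate generic records is an assumption rather than something shown in this excerpt.

// Hypothetical usage example of the writeAndValidate helper above.
@Test
public void testRoundTripWithRandomRecords() throws IOException {
  Schema schema = new Schema(
      required(1, "id", Types.LongType.get()),
      required(2, "data", Types.StringType.get()));
  // Generate random generic records and round-trip them through the ORC writers and readers.
  List<Record> expectedRecords = RandomGenericData.generate(schema, 100, 0L);
  writeAndValidate(schema, expectedRecords);
}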