Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class CherrypickSnapshotProcedure, method call().
@Override
public InternalRow[] call(InternalRow args) {
  Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
  long snapshotId = args.getLong(1);
  return modifyIcebergTable(tableIdent, table -> {
    table.manageSnapshots().cherrypick(snapshotId).commit();
    Snapshot currentSnapshot = table.currentSnapshot();
    InternalRow outputRow = newInternalRow(snapshotId, currentSnapshot.snapshotId());
    return new InternalRow[] { outputRow };
  });
}
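For context, this procedure is normally invoked through Spark SQL's CALL syntax once the Iceberg SQL extensions are enabled. A minimal sketch follows; the catalog name local, the table name db.tbl, and the snapshot id are illustrative assumptions, not values from the snippet above.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CherrypickSnapshotSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("cherrypick-sketch")
        .getOrCreate();

    // The result rows correspond to the InternalRow[] built in call() above:
    // the cherry-picked snapshot id and the table's resulting current snapshot id.
    Dataset<Row> result =
        spark.sql("CALL local.system.cherrypick_snapshot('db.tbl', 1234567890)");
    result.show();
  }
}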
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestDataFileSerialization, method testParquetWriterSplitOffsets().
@Test
public void testParquetWriterSplitOffsets() throws IOException {
  Iterable<InternalRow> records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L);
  File parquetFile = new File(temp.getRoot(), FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
  FileAppender<InternalRow> writer = Parquet.write(Files.localOutput(parquetFile))
      .schema(DATE_SCHEMA)
      .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType))
      .build();
  try {
    writer.addAll(records);
  } finally {
    writer.close();
  }
  Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
  File dataFile = temp.newFile();
  try (Output out = new Output(new FileOutputStream(dataFile))) {
    kryo.writeClassAndObject(out, writer.splitOffsets());
  }
  try (Input in = new Input(new FileInputStream(dataFile))) {
    kryo.readClassAndObject(in);
  }
}
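The offsets round-tripped through Kryo here are the same values Iceberg records in a data file's metadata for split planning. A minimal sketch of attaching them when building a DataFile is shown below; it assumes an unpartitioned spec, and the method name toDataFile and its parameters are illustrative.
import java.util.List;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;

class SplitOffsetsSketch {
  // Builds file metadata that carries the split offsets reported by the Parquet writer.
  static DataFile toDataFile(String path, long fileSizeInBytes, long recordCount, List<Long> splitOffsets) {
    return DataFiles.builder(PartitionSpec.unpartitioned())
        .withPath(path)
        .withFormat(FileFormat.PARQUET)
        .withFileSizeInBytes(fileSizeInBytes)
        .withRecordCount(recordCount)
        .withSplitOffsets(splitOffsets)
        .build();
  }
}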
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkAvroEnums, method writeAndValidateEnums().
@Test
public void writeAndValidateEnums() throws IOException {
  org.apache.avro.Schema avroSchema = SchemaBuilder.record("root").fields()
      .name("enumCol").type().nullable().enumeration("testEnum")
      .symbols("SYMB1", "SYMB2").enumDefault("SYMB2")
      .endRecord();
  org.apache.avro.Schema enumSchema = avroSchema.getField("enumCol").schema().getTypes().get(0);
  Record enumRecord1 = new GenericData.Record(avroSchema);
  enumRecord1.put("enumCol", new GenericData.EnumSymbol(enumSchema, "SYMB1"));
  Record enumRecord2 = new GenericData.Record(avroSchema);
  enumRecord2.put("enumCol", new GenericData.EnumSymbol(enumSchema, "SYMB2"));
  // null enum
  Record enumRecord3 = new GenericData.Record(avroSchema);
  List<Record> expected = ImmutableList.of(enumRecord1, enumRecord2, enumRecord3);
  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());
  try (DataFileWriter<Record> writer = new DataFileWriter<>(new GenericDatumWriter<>())) {
    writer.create(avroSchema, testFile);
    writer.append(enumRecord1);
    writer.append(enumRecord2);
    writer.append(enumRecord3);
  }
  Schema schema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields());
  List<InternalRow> rows;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }
  // Iceberg will return enums as strings, so we compare string values for the enum field
  for (int i = 0; i < expected.size(); i += 1) {
    String expectedEnumString = expected.get(i).get("enumCol") == null ? null : expected.get(i).get("enumCol").toString();
    String sparkString = rows.get(i).getUTF8String(0) == null ? null : rows.get(i).getUTF8String(0).toString();
    Assert.assertEquals(expectedEnumString, sparkString);
  }
}
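Because Iceberg has no enum type, the Avro enum column converts to a string column during schema conversion, which is why the test reads the value back as a UTF8String. A minimal sketch that performs only the conversion and prints the resulting struct, reusing the same schema-building calls as the test; the class name is illustrative.
import org.apache.avro.SchemaBuilder;
import org.apache.iceberg.Schema;
import org.apache.iceberg.avro.AvroSchemaUtil;

class EnumConversionSketch {
  public static void main(String[] args) {
    org.apache.avro.Schema avroSchema = SchemaBuilder.record("root").fields()
        .name("enumCol").type().nullable().enumeration("testEnum")
        .symbols("SYMB1", "SYMB2").enumDefault("SYMB2")
        .endRecord();

    // Expected to print an optional string field, since the enum has no Iceberg equivalent.
    Schema icebergSchema = new Schema(AvroSchemaUtil.convert(avroSchema).asStructType().fields());
    System.out.println(icebergSchema.asStruct());
  }
}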
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkAvroReader, method writeAndValidate().
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomData.generateList(schema, 100, 0L);
  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());
  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : expected) {
      writer.add(rec);
    }
  }
  List<InternalRow> rows;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }
  for (int i = 0; i < expected.size(); i += 1) {
    assertEqualsUnsafe(schema.asStruct(), expected.get(i), rows.get(i));
  }
}
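This override is driven by schemas supplied by the surrounding test harness. A minimal sketch of a flat Iceberg schema that could be passed to it; the field names and ids are illustrative.
import static org.apache.iceberg.types.Types.NestedField.optional;
import static org.apache.iceberg.types.Types.NestedField.required;

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

class SimpleSchemaSketch {
  // A two-column schema: a required long id and an optional string payload.
  static final Schema SIMPLE_SCHEMA = new Schema(
      required(1, "id", Types.LongType.get()),
      optional(2, "data", Types.StringType.get()));
}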
Use of org.apache.spark.sql.catalyst.InternalRow in project iceberg by apache.
The class TestSparkOrcReader, method writeAndValidateRecords().
private void writeAndValidateRecords(Schema schema, Iterable<InternalRow> expected) throws IOException {
  final File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());
  try (FileAppender<InternalRow> writer = ORC.write(Files.localOutput(testFile))
      .createWriterFunc(SparkOrcWriter::new)
      .schema(schema)
      .build()) {
    writer.addAll(expected);
  }
  try (CloseableIterable<InternalRow> reader = ORC.read(Files.localInput(testFile))
      .project(schema)
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(schema, readOrcSchema))
      .build()) {
    final Iterator<InternalRow> actualRows = reader.iterator();
    final Iterator<InternalRow> expectedRows = expected.iterator();
    while (expectedRows.hasNext()) {
      Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
      assertEquals(schema, expectedRows.next(), actualRows.next());
    }
    Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
  }
  try (CloseableIterable<ColumnarBatch> reader = ORC.read(Files.localInput(testFile))
      .project(schema)
      .createBatchedReaderFunc(readOrcSchema -> VectorizedSparkOrcReaders.buildReader(schema, readOrcSchema, ImmutableMap.of()))
      .build()) {
    final Iterator<InternalRow> actualRows = batchesToRows(reader.iterator());
    final Iterator<InternalRow> expectedRows = expected.iterator();
    while (expectedRows.hasNext()) {
      Assert.assertTrue("Should have expected number of rows", actualRows.hasNext());
      assertEquals(schema, expectedRows.next(), actualRows.next());
    }
    Assert.assertFalse("Should not have extra rows", actualRows.hasNext());
  }
}
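The batchesToRows helper used in the vectorized path is not shown in this snippet. A minimal sketch of one possible implementation, flattening each ColumnarBatch into its rows with Iceberg's relocated Guava Iterators; the class name is illustrative.
import java.util.Iterator;
import org.apache.iceberg.relocated.com.google.common.collect.Iterators;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnarBatch;

class BatchesToRowsSketch {
  // Each ColumnarBatch exposes a row iterator; concatenate them in order.
  static Iterator<InternalRow> batchesToRows(Iterator<ColumnarBatch> batches) {
    return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator));
  }
}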