Use of org.apache.iceberg.orc.ORC in project iceberg by apache.
The class BaseFileWriterFactory, method newDataWriter:
@Override
public DataWriter<T> newDataWriter(EncryptedOutputFile file, PartitionSpec spec, StructLike partition) {
  OutputFile outputFile = file.encryptingOutputFile();
  EncryptionKeyMetadata keyMetadata = file.keyMetadata();
  Map<String, String> properties = table.properties();
  MetricsConfig metricsConfig = MetricsConfig.forTable(table);

  try {
    switch (dataFileFormat) {
      case AVRO:
        Avro.DataWriteBuilder avroBuilder = Avro.writeData(outputFile)
            .schema(dataSchema)
            .setAll(properties)
            .metricsConfig(metricsConfig)
            .withSpec(spec)
            .withPartition(partition)
            .withKeyMetadata(keyMetadata)
            .withSortOrder(dataSortOrder)
            .overwrite();
        configureDataWrite(avroBuilder);
        return avroBuilder.build();

      case PARQUET:
        Parquet.DataWriteBuilder parquetBuilder = Parquet.writeData(outputFile)
            .schema(dataSchema)
            .setAll(properties)
            .metricsConfig(metricsConfig)
            .withSpec(spec)
            .withPartition(partition)
            .withKeyMetadata(keyMetadata)
            .withSortOrder(dataSortOrder)
            .overwrite();
        configureDataWrite(parquetBuilder);
        return parquetBuilder.build();

      case ORC:
        ORC.DataWriteBuilder orcBuilder = ORC.writeData(outputFile)
            .schema(dataSchema)
            .setAll(properties)
            .metricsConfig(metricsConfig)
            .withSpec(spec)
            .withPartition(partition)
            .withKeyMetadata(keyMetadata)
            .withSortOrder(dataSortOrder)
            .overwrite();
        configureDataWrite(orcBuilder);
        return orcBuilder.build();

      default:
        throw new UnsupportedOperationException("Unsupported data file format: " + dataFileFormat);
    }
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
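For context, a minimal caller sketch, not part of the Iceberg source: it assumes a Table, a concrete FileWriterFactory<Record> implementation (BaseFileWriterFactory subclasses implement that interface), and placeholder partition/task ids for the OutputFileFactory. Note that toDataFile() is only valid after the writer is closed.

import java.io.IOException;
import java.util.List;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.io.DataWriter;
import org.apache.iceberg.io.FileWriterFactory;
import org.apache.iceberg.io.OutputFileFactory;

// Sketch only: partition id 1 and task id 1 are placeholder values.
static DataFile writeDataFile(Table table, FileWriterFactory<Record> writerFactory,
    List<Record> records) throws IOException {
  EncryptedOutputFile outputFile = OutputFileFactory.builderFor(table, 1, 1)
      .format(FileFormat.ORC)
      .build()
      .newOutputFile();

  // a null partition key is valid for an unpartitioned spec
  DataWriter<Record> writer = writerFactory.newDataWriter(
      outputFile, PartitionSpec.unpartitioned(), null);
  try {
    for (Record record : records) {
      writer.write(record);
    }
  } finally {
    writer.close();
  }

  // toDataFile() may only be called after close()
  return writer.toDataFile();
}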
Use of org.apache.iceberg.orc.ORC in project iceberg by apache.
The class FlinkAppenderFactory, method newEqDeleteWriter:
@Override
public EqualityDeleteWriter<RowData> newEqDeleteWriter(EncryptedOutputFile outputFile, FileFormat format, StructLike partition) {
  Preconditions.checkState(equalityFieldIds != null && equalityFieldIds.length > 0,
      "Equality field ids shouldn't be null or empty when creating equality-delete writer");
  Preconditions.checkNotNull(eqDeleteRowSchema,
      "Equality delete row schema shouldn't be null when creating equality-delete writer");

  MetricsConfig metricsConfig = MetricsConfig.fromProperties(props);
  try {
    switch (format) {
      case AVRO:
        return Avro.writeDeletes(outputFile.encryptingOutputFile())
            .createWriterFunc(ignore -> new FlinkAvroWriter(lazyEqDeleteFlinkSchema()))
            .withPartition(partition)
            .overwrite()
            .setAll(props)
            .rowSchema(eqDeleteRowSchema)
            .withSpec(spec)
            .withKeyMetadata(outputFile.keyMetadata())
            .equalityFieldIds(equalityFieldIds)
            .buildEqualityWriter();

      case ORC:
        return ORC.writeDeletes(outputFile.encryptingOutputFile())
            .createWriterFunc((iSchema, typDesc) -> FlinkOrcWriter.buildWriter(flinkSchema, iSchema))
            .withPartition(partition)
            .overwrite()
            .setAll(props)
            .rowSchema(eqDeleteRowSchema)
            .withSpec(spec)
            .withKeyMetadata(outputFile.keyMetadata())
            .equalityFieldIds(equalityFieldIds)
            .buildEqualityWriter();

      case PARQUET:
        return Parquet.writeDeletes(outputFile.encryptingOutputFile())
            .createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(lazyEqDeleteFlinkSchema(), msgType))
            .withPartition(partition)
            .overwrite()
            .setAll(props)
            .metricsConfig(metricsConfig)
            .rowSchema(eqDeleteRowSchema)
            .withSpec(spec)
            .withKeyMetadata(outputFile.keyMetadata())
            .equalityFieldIds(equalityFieldIds)
            .buildEqualityWriter();

      default:
        throw new UnsupportedOperationException("Cannot write equality-deletes for unsupported file format: " + format);
    }
  } catch (IOException e) {
    throw new UncheckedIOException(e);
  }
}
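A hedged usage sketch, not from the factory's source: assuming an already-configured FlinkAppenderFactory, an EncryptedOutputFile, and a Table instance, the returned writer collects the rows to delete and, once closed, yields a DeleteFile that is published through a row delta. The method name and wiring below are illustrative.

import java.io.IOException;
import org.apache.flink.table.data.RowData;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Table;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.flink.sink.FlinkAppenderFactory;

// Sketch only: appenderFactory, outputFile, and deletedRows are assumed to exist.
static void writeEqualityDeletes(Table table, FlinkAppenderFactory appenderFactory,
    EncryptedOutputFile outputFile, Iterable<RowData> deletedRows) throws IOException {
  // null partition: this example targets an unpartitioned spec
  EqualityDeleteWriter<RowData> deleteWriter =
      appenderFactory.newEqDeleteWriter(outputFile, FileFormat.ORC, null);
  try {
    for (RowData row : deletedRows) {
      deleteWriter.write(row);
    }
  } finally {
    deleteWriter.close();
  }
  // toDeleteFile() is only valid after close(); publish the deletes atomically
  table.newRowDelta().addDeletes(deleteWriter.toDeleteFile()).commit();
}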
Use of org.apache.iceberg.orc.ORC in project iceberg by apache.
The class TestFlinkOrcReaderWriter, method writeAndValidate:
@Override
protected void writeAndValidate(Schema schema) throws IOException {
  RowType flinkSchema = FlinkSchemaUtil.convert(schema);
  List<Record> expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L);
  List<RowData> expectedRows = Lists.newArrayList(RandomRowData.convert(schema, expectedRecords));

  File recordsFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", recordsFile.delete());

  // Write the expected records into an ORC file, then read them back as RowData
  // and assert against the expected Record list.
  try (FileAppender<Record> writer = ORC.write(Files.localOutput(recordsFile))
      .schema(schema)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    writer.addAll(expectedRecords);
  }

  try (CloseableIterable<RowData> reader = ORC.read(Files.localInput(recordsFile))
      .project(schema)
      .createReaderFunc(type -> new FlinkOrcReader(schema, type))
      .build()) {
    Iterator<Record> expected = expectedRecords.iterator();
    Iterator<RowData> rows = reader.iterator();
    for (int i = 0; i < NUM_RECORDS; i++) {
      Assert.assertTrue("Should have expected number of records", rows.hasNext());
      TestHelpers.assertRowData(schema.asStruct(), flinkSchema, expected.next(), rows.next());
    }
    Assert.assertFalse("Should not have extra records", rows.hasNext());
  }

  File rowDataFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", rowDataFile.delete());

  // Write the expected RowData into an ORC file, then read them back as Records
  // and assert against the expected RowData list.
  RowType rowType = FlinkSchemaUtil.convert(schema);
  try (FileAppender<RowData> writer = ORC.write(Files.localOutput(rowDataFile))
      .schema(schema)
      .createWriterFunc((iSchema, typeDesc) -> FlinkOrcWriter.buildWriter(rowType, iSchema))
      .build()) {
    writer.addAll(expectedRows);
  }

  try (CloseableIterable<Record> reader = ORC.read(Files.localInput(rowDataFile))
      .project(schema)
      .createReaderFunc(type -> GenericOrcReader.buildReader(schema, type))
      .build()) {
    Iterator<RowData> expected = expectedRows.iterator();
    Iterator<Record> records = reader.iterator();
    for (int i = 0; i < NUM_RECORDS; i += 1) {
      Assert.assertTrue("Should have expected number of records", records.hasNext());
      TestHelpers.assertRowData(schema.asStruct(), flinkSchema, records.next(), expected.next());
    }
    Assert.assertFalse("Should not have extra records", records.hasNext());
  }
}
Use of org.apache.iceberg.orc.ORC in project iceberg by apache.
The class TestGenericReadProjection, method writeAndRead:
@Override
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record) throws IOException {
  File file = temp.newFile(desc + ".orc");
  Assert.assertTrue("Delete should succeed", file.delete());

  try (FileAppender<Record> appender = ORC.write(Files.localOutput(file))
      .schema(writeSchema)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    appender.add(record);
  }

  Iterable<Record> records = ORC.read(Files.localInput(file))
      .project(readSchema)
      .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema))
      .build();

  return Iterables.getOnlyElement(records);
}
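The projection mechanics can be made more concrete. In the sketch below (field names and ids are illustrative, not taken from this test), the read-side schema selects a subset of the write-side columns, and GenericOrcReader resolves the projection against the file schema by field id.

import java.io.File;
import java.io.IOException;
import org.apache.iceberg.Files;
import org.apache.iceberg.Schema;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.orc.GenericOrcReader;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.orc.ORC;
import org.apache.iceberg.types.Types;

// Hypothetical helper; "file" is an already-written ORC file like the one above.
static void readProjected(File file) throws IOException {
  Schema writeSchema = new Schema(
      Types.NestedField.required(1, "id", Types.LongType.get()),
      Types.NestedField.optional(2, "name", Types.StringType.get()),
      Types.NestedField.optional(3, "payload", Types.StringType.get()));
  Schema readSchema = writeSchema.select("id", "name");

  try (CloseableIterable<Record> rows = ORC.read(Files.localInput(file))
      .project(readSchema)
      .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema))
      .build()) {
    for (Record row : rows) {
      // each Record exposes only the projected columns "id" and "name"
    }
  }
}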
Use of org.apache.iceberg.orc.ORC in project iceberg by apache.
The class TestOrcDataWriter, method testDataWriter:
@Test
public void testDataWriter() throws IOException {
  OutputFile file = Files.localOutput(temp.newFile());
  SortOrder sortOrder = SortOrder.builderFor(SCHEMA)
      .withOrderId(10)
      .asc("id")
      .build();

  DataWriter<Record> dataWriter = ORC.writeData(file)
      .schema(SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .overwrite()
      .withSpec(PartitionSpec.unpartitioned())
      .withSortOrder(sortOrder)
      .build();

  try {
    for (Record record : records) {
      dataWriter.write(record);
    }
  } finally {
    dataWriter.close();
  }

  DataFile dataFile = dataWriter.toDataFile();
  Assert.assertEquals("Format should be ORC", FileFormat.ORC, dataFile.format());
  Assert.assertEquals("Should be data file", FileContent.DATA, dataFile.content());
  Assert.assertEquals("Record count should match", records.size(), dataFile.recordCount());
  Assert.assertEquals("Partition should be empty", 0, dataFile.partition().size());
  Assert.assertEquals("Sort order should match", sortOrder.orderId(), (int) dataFile.sortOrderId());
  Assert.assertNull("Key metadata should be null", dataFile.keyMetadata());

  List<Record> writtenRecords;
  try (CloseableIterable<Record> reader = ORC.read(file.toInputFile())
      .project(SCHEMA)
      .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema))
      .build()) {
    writtenRecords = Lists.newArrayList(reader);
  }
  Assert.assertEquals("Written records should match", records, writtenRecords);
}
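As a follow-up: the DataFile returned by toDataFile() is pure metadata about the written file. Outside a test it would typically be registered with a table, as in this minimal sketch (the "table" instance is assumed, not part of the test above).

// Sketch only: "table" must be an existing Iceberg table whose schema
// and partition spec match the written file.
table.newAppend()
    .appendFile(dataFile)
    .commit(); // creates a new snapshot containing the ORC data file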