Search in sources :

Example 1 with AvroFileDeltaInputWriter

use of org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter in project hudi by apache.

the class TestFileDeltaInputWriter method testAvroFileSinkCreateNewWriter.

/**
 * Writes 100 generated records through an {@link AvroFileDeltaInputWriter}, closes it, and then
 * checks that the writer returned by {@code getNewWriter()} reports its own stats: a non-null
 * file path, zero records written, and a fixed number of bytes written.
 *
 * @throws IOException propagated from the writer calls
 */
@Test
public void testAvroFileSinkCreateNewWriter() throws IOException {
    // 1. Create an Avro file sink writer
    DeltaInputWriter<GenericRecord> fileSinkWriter = new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath, schemaProvider.getSourceSchema().toString(), 1024 * 1024L);
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
    // 2. Generate 100 avro payloads and write them to an avro file
    IntStream.range(0, 100).forEach(a -> {
        try {
            fileSinkWriter.writeData(payloadGenerator.getNewPayload());
        } catch (IOException io) {
            // The lambda cannot throw a checked exception, so rethrow unchecked with the cause kept.
            throw new UncheckedIOException(io);
        }
    });
    fileSinkWriter.close();
    // 3. The closed writer must have recorded the file it wrote to
    String oldFilePath = fileSinkWriter.getDeltaWriteStats().getFilePath();
    assertFalse(oldFilePath == null);
    // 4. Roll over to a new writer and close it without writing any record
    DeltaInputWriter<GenericRecord> newFileSinkWriter = fileSinkWriter.getNewWriter();
    newFileSinkWriter.close();
    DeltaWriteStats newStats = newFileSinkWriter.getDeltaWriteStats();
    // NOTE(review): 3674 is presumably the size of an empty Avro container file (header only)
    // for the test schema; this literal will need updating if the source schema changes.
    // JUnit expects (expected, actual) so that failure messages read correctly.
    assertEquals(3674, newStats.getBytesWritten());
    assertEquals(0, newStats.getRecordsWritten());
    assertTrue(newStats.getFilePath() != null);
}
Also used : AvroFileDeltaInputWriter(org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter) GenericRecordFullPayloadGenerator(org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) GenericRecord(org.apache.avro.generic.GenericRecord) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) Test(org.junit.jupiter.api.Test)

Example 2 with AvroFileDeltaInputWriter

use of org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter in project hudi by apache.

the class TestFileDeltaInputWriter method testAvroFileSinkWriter.

/**
 * Writes 100 generated records through an {@link AvroFileDeltaInputWriter} and verifies the
 * result both on the file system (exactly one non-empty file at the reported path) and by
 * reading the Avro files back with Spark and comparing the record count with the writer stats.
 *
 * @throws IOException propagated from the writer and file system calls
 */
@Test
public void testAvroFileSinkWriter() throws IOException {
    // 1. Create an Avro file sink writer
    DeltaInputWriter<GenericRecord> fileSinkWriter = new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath + "/input", schemaProvider.getSourceSchema().toString(), 1024 * 1024L);
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
    // 2. Generate 100 avro payloads and write them to an avro file
    IntStream.range(0, 100).forEach(a -> {
        try {
            fileSinkWriter.writeData(payloadGenerator.getNewPayload());
        } catch (IOException io) {
            // The lambda cannot throw a checked exception, so rethrow unchecked with the cause kept.
            throw new UncheckedIOException(io);
        }
    });
    fileSinkWriter.close();
    DeltaWriteStats deltaWriteStats = fileSinkWriter.getDeltaWriteStats();
    FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
    FileStatus[] fileStatuses = fs.listStatus(new Path(deltaWriteStats.getFilePath()));
    // Exactly one file was written at the reported path
    assertEquals(1, fileStatuses.length);
    // File length should be greater than 0
    assertTrue(fileStatuses[0].getLen() > 0);
    // The stats should report a positive number of bytes written
    assertTrue(deltaWriteStats.getBytesWritten() > 0);
    // 3. Read every avro file under the base path back through Spark
    List<String> paths = Arrays.stream(fs.globStatus(new Path(dfsBasePath + "/*/*.avro"))).map(f -> f.getPath().toString()).collect(Collectors.toList());
    JavaRDD<GenericRecord> writtenRecords = SparkBasedReader.readAvro(sparkSession, schemaProvider.getSourceSchema().toString(), paths, Option.empty(), Option.empty());
    // count() is a Spark action; evaluate it once and reuse the result for both assertions
    long recordCount = writtenRecords.count();
    // Number of records written should be 100 (JUnit expects (expected, actual))
    assertEquals(100, recordCount);
    // Number of records in file should match with the stats
    assertEquals(recordCount, deltaWriteStats.getRecordsWritten());
}
Also used : Path(org.apache.hadoop.fs.Path) IntStream(java.util.stream.IntStream) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) AfterAll(org.junit.jupiter.api.AfterAll) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) GenericRecordFullPayloadGenerator(org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator) BeforeAll(org.junit.jupiter.api.BeforeAll) AvroFileDeltaInputWriter(org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) GenericRecord(org.apache.avro.generic.GenericRecord) DeltaInputWriter(org.apache.hudi.integ.testsuite.writer.DeltaInputWriter) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) IOException(java.io.IOException) UtilitiesTestBase(org.apache.hudi.utilities.testutils.UtilitiesTestBase) Collectors(java.util.stream.Collectors) UncheckedIOException(java.io.UncheckedIOException) Test(org.junit.jupiter.api.Test) List(java.util.List) AfterEach(org.junit.jupiter.api.AfterEach) FilebasedSchemaProvider(org.apache.hudi.utilities.schema.FilebasedSchemaProvider) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SparkBasedReader(org.apache.hudi.integ.testsuite.reader.SparkBasedReader) FSUtils(org.apache.hudi.common.fs.FSUtils)

Aggregations

IOException (java.io.IOException)2 UncheckedIOException (java.io.UncheckedIOException)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 GenericRecordFullPayloadGenerator (org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator)2 AvroFileDeltaInputWriter (org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter)2 DeltaWriteStats (org.apache.hudi.integ.testsuite.writer.DeltaWriteStats)2 Test (org.junit.jupiter.api.Test)2 Arrays (java.util.Arrays)1 List (java.util.List)1 Collectors (java.util.stream.Collectors)1 IntStream (java.util.stream.IntStream)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 FSUtils (org.apache.hudi.common.fs.FSUtils)1 Option (org.apache.hudi.common.util.Option)1 SparkBasedReader (org.apache.hudi.integ.testsuite.reader.SparkBasedReader)1 DeltaInputWriter (org.apache.hudi.integ.testsuite.writer.DeltaInputWriter)1 FilebasedSchemaProvider (org.apache.hudi.utilities.schema.FilebasedSchemaProvider)1 UtilitiesTestBase (org.apache.hudi.utilities.testutils.UtilitiesTestBase)1