Search in sources :

Example 6 with DeltaWriteStats

Use of org.apache.hudi.integ.testsuite.writer.DeltaWriteStats in the Apache Hudi project.

In the class TestFileDeltaInputWriter, method testAvroFileSinkWriter.

/**
 * Writes 100 generated Avro payloads through an {@link AvroFileDeltaInputWriter} and checks that
 * the reported {@link DeltaWriteStats} agree with what is actually found on the file system:
 * exactly one non-empty file at the reported path, a positive byte count, and a record count
 * of 100 both when read back and in the stats.
 *
 * @throws IOException if writing, closing the writer, or listing the file system fails
 */
@Test
public void testAvroFileSinkWriter() throws IOException {
    final int numRecords = 100;
    // 1. Create an Avro file sink writer
    DeltaInputWriter<GenericRecord> fileSinkWriter = new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath + "/input", schemaProvider.getSourceSchema().toString(), 1024 * 1024L);
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
    // 2. Generate the Avro payloads and write them to an Avro file
    IntStream.range(0, numRecords).forEach(a -> {
        try {
            fileSinkWriter.writeData(payloadGenerator.getNewPayload());
        } catch (IOException io) {
            // Lambdas cannot throw checked exceptions; rethrow unchecked with the cause preserved
            throw new UncheckedIOException(io);
        }
    });
    fileSinkWriter.close();
    // 3. Stats are read after close()
    DeltaWriteStats deltaWriteStats = fileSinkWriter.getDeltaWriteStats();
    FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
    FileStatus[] fileStatuses = fs.listStatus(new Path(deltaWriteStats.getFilePath()));
    // Exactly 1 file was written
    assertEquals(1, fileStatuses.length);
    // File length should be greater than 0
    assertTrue(fileStatuses[0].getLen() > 0);
    // The stats should report a positive number of bytes written
    assertTrue(deltaWriteStats.getBytesWritten() > 0);
    // 4. Read back every Avro file under the base path and compare record counts
    List<String> paths = Arrays.stream(fs.globStatus(new Path(dfsBasePath + "/*/*.avro"))).map(f -> f.getPath().toString()).collect(Collectors.toList());
    JavaRDD<GenericRecord> writtenRecords = SparkBasedReader.readAvro(sparkSession, schemaProvider.getSourceSchema().toString(), paths, Option.empty(), Option.empty());
    // count() is a Spark action; run it once and reuse the result for both assertions
    long recordCount = writtenRecords.count();
    // Number of records written should be 100 (expected value first, then actual)
    assertEquals(numRecords, recordCount);
    // Number of records in file should match with the stats
    assertEquals(recordCount, deltaWriteStats.getRecordsWritten());
}
Also used : Path(org.apache.hadoop.fs.Path) IntStream(java.util.stream.IntStream) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) AfterAll(org.junit.jupiter.api.AfterAll) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) GenericRecordFullPayloadGenerator(org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator) BeforeAll(org.junit.jupiter.api.BeforeAll) Path(org.apache.hadoop.fs.Path) AvroFileDeltaInputWriter(org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) GenericRecord(org.apache.avro.generic.GenericRecord) DeltaInputWriter(org.apache.hudi.integ.testsuite.writer.DeltaInputWriter) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) IOException(java.io.IOException) UtilitiesTestBase(org.apache.hudi.utilities.testutils.UtilitiesTestBase) Collectors(java.util.stream.Collectors) UncheckedIOException(java.io.UncheckedIOException) Test(org.junit.jupiter.api.Test) List(java.util.List) AfterEach(org.junit.jupiter.api.AfterEach) FilebasedSchemaProvider(org.apache.hudi.utilities.schema.FilebasedSchemaProvider) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SparkBasedReader(org.apache.hudi.integ.testsuite.reader.SparkBasedReader) FSUtils(org.apache.hudi.common.fs.FSUtils) FileStatus(org.apache.hadoop.fs.FileStatus) GenericRecordFullPayloadGenerator(org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) AvroFileDeltaInputWriter(org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter) 
FileSystem(org.apache.hadoop.fs.FileSystem) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.jupiter.api.Test)

Aggregations

GenericRecord (org.apache.avro.generic.GenericRecord)6 DeltaWriteStats (org.apache.hudi.integ.testsuite.writer.DeltaWriteStats)6 IOException (java.io.IOException)4 Test (org.junit.jupiter.api.Test)4 UncheckedIOException (java.io.UncheckedIOException)3 Arrays (java.util.Arrays)3 Option (org.apache.hudi.common.util.Option)3 JavaRDD (org.apache.spark.api.java.JavaRDD)3 Serializable (java.io.Serializable)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Map (java.util.Map)2 Collectors (java.util.stream.Collectors)2 IntStream (java.util.stream.IntStream)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 Path (org.apache.hadoop.fs.Path)2 FSUtils (org.apache.hudi.common.fs.FSUtils)2 GenericRecordFullPayloadGenerator (org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator)2 AvroFileDeltaInputWriter (org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter)2 ArrayList (java.util.ArrayList)1