Search in sources :

Example 1 with DeltaWriteStats

use of org.apache.hudi.integ.testsuite.writer.DeltaWriteStats in project hudi by apache.

the class DeltaGenerator method writeRecords.

/**
 * Writes the supplied records as the next batch of delta data and then advances the batch id.
 *
 * <p>If the output config requests deletion of old input data and this is not the first batch,
 * the directory named after the previous batch id (under the delta base path) is removed
 * recursively before writing. A failure to delete is logged and does not abort the write.
 *
 * @param records records to write; a delta writer adapter is obtained once per partition
 * @return the write stats returned by the per-partition writers, flattened into a single RDD
 * @throws UncheckedIOException from within the partition function if obtaining the adapter or
 *         writing raises an {@link IOException}
 */
public JavaRDD<DeltaWriteStats> writeRecords(JavaRDD<GenericRecord> records) {
    final boolean purgePreviousBatch = deltaOutputConfig.shouldDeleteOldInputData() && batchId > 1;
    if (purgePreviousBatch) {
        final Path previousBatchDir = new Path(deltaOutputConfig.getDeltaBasePath(), String.valueOf(batchId - 1));
        try {
            FileSystem previousBatchFs = FSUtils.getFs(previousBatchDir.toString(), deltaOutputConfig.getConfiguration());
            previousBatchFs.delete(previousBatchDir, true);
        } catch (IOException deleteFailure) {
            // Best effort only: stale input left behind must not stop the new batch from being written.
            log.error("Failed to delete older input data direcory " + previousBatchDir, deleteFailure);
        }
    }
    // The partition function below is a fresh anonymous function over the iterator, which is
    // known to cause serialization issues.
    // NOTE(review): the lambda reads deltaOutputConfig and batchId from outside this method; if the
    // RDD is only evaluated after the increment below, it may observe the incremented id - confirm.
    JavaRDD<DeltaWriteStats> flattenedStats = records.mapPartitions(partitionRecords -> {
        try {
            DeltaWriterAdapter<GenericRecord> partitionWriter = DeltaWriterFactory.getDeltaWriterAdapter(deltaOutputConfig, batchId);
            return Collections.singletonList(partitionWriter.write(partitionRecords)).iterator();
        } catch (IOException writeFailure) {
            throw new UncheckedIOException(writeFailure);
        }
    }).flatMap(List::iterator);
    batchId++;
    return flattenedStats;
}
Also used : Path(org.apache.hadoop.fs.Path) IntStream(java.util.stream.IntStream) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) DeltaWriterFactory(org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) Option(org.apache.hudi.common.util.Option) BuiltinKeyGenerator(org.apache.hudi.keygen.BuiltinKeyGenerator) UpdateConverter(org.apache.hudi.integ.testsuite.converter.UpdateConverter) ArrayList(java.util.ArrayList) Converter(org.apache.hudi.integ.testsuite.converter.Converter) DFSHoodieDatasetInputReader(org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader) SchemaUtils(org.apache.hudi.integ.testsuite.schema.SchemaUtils) StorageLevel(org.apache.spark.storage.StorageLevel) Config(org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config) Map(java.util.Map) DFSAvroDeltaInputReader(org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader) Path(org.apache.hadoop.fs.Path) StreamSupport(java.util.stream.StreamSupport) DeltaOutputMode(org.apache.hudi.integ.testsuite.writer.DeltaOutputMode) DeltaInputReader(org.apache.hudi.integ.testsuite.reader.DeltaInputReader) DeltaWriterAdapter(org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter) JavaRDD(org.apache.spark.api.java.JavaRDD) DFSDeltaConfig(org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig) SparkSession(org.apache.spark.sql.SparkSession) GenericRecord(org.apache.avro.generic.GenericRecord) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) IOException(java.io.IOException) DeleteConverter(org.apache.hudi.integ.testsuite.converter.DeleteConverter) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) UncheckedIOException(java.io.UncheckedIOException) List(java.util.List) Collections(java.util.Collections) 
FSUtils(org.apache.hudi.common.fs.FSUtils) DeltaWriterAdapter(org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter) FileSystem(org.apache.hadoop.fs.FileSystem) UncheckedIOException(java.io.UncheckedIOException) ArrayList(java.util.ArrayList) List(java.util.List) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats)

Example 2 with DeltaWriteStats

use of org.apache.hudi.integ.testsuite.writer.DeltaWriteStats in project hudi by apache.

the class TestDFSHoodieTestSuiteWriterAdapter method testDFSTwoFilesWriteWithRollover.

/**
 * Drives a {@code DFSDeltaWriterAdapter} over a mocked writer whose {@code canWrite()} answers
 * {@code false} on the first call and {@code true} afterwards, and checks that exactly one new
 * writer was requested and that {@code close()} was called twice.
 */
@Test
public void testDFSTwoFilesWriteWithRollover() throws IOException {
    // Mocked collaborators: the underlying file writer and the stats object it reports.
    DeltaWriteStats stubbedStats = Mockito.mock(DeltaWriteStats.class);
    DeltaInputWriter<GenericRecord> writerMock = Mockito.mock(AvroFileDeltaInputWriter.class);

    // The mock hands itself back as the "new" writer so every interaction lands on one object.
    when(writerMock.getDeltaWriteStats()).thenReturn(stubbedStats);
    when(writerMock.getNewWriter()).thenReturn(writerMock);
    // First canWrite() call answers false, every later call answers true.
    when(writerMock.canWrite()).thenReturn(false, true);

    // Input iterator reports three available elements and then end-of-input.
    Iterator<GenericRecord> recordSource = Mockito.mock(Iterator.class);
    when(recordSource.hasNext()).thenReturn(true, true, true, false);

    DeltaWriterAdapter<GenericRecord> adapterUnderTest = new DFSDeltaWriterAdapter(writerMock);
    adapterUnderTest.write(recordSource);

    Mockito.verify(writerMock, times(1)).getNewWriter();
    Mockito.verify(writerMock, times(2)).canWrite();
    Mockito.verify(writerMock, times(2)).close();
}
Also used : DFSDeltaWriterAdapter(org.apache.hudi.integ.testsuite.writer.DFSDeltaWriterAdapter) GenericRecord(org.apache.avro.generic.GenericRecord) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) Test(org.junit.jupiter.api.Test)

Example 3 with DeltaWriteStats

use of org.apache.hudi.integ.testsuite.writer.DeltaWriteStats in project hudi by apache.

the class HoodieTestSuiteWriter method commitCompaction.

/**
 * Commits a compaction through the write client, attaching the last checkpoint (and, when
 * available, the generated-data path) as extra commit metadata. Does nothing when the delta
 * streamer is in use.
 *
 * @param records            write statuses used to build the compaction commit metadata
 * @param generatedDataStats stats of the generated input data; may be {@code null}
 * @param instantTime        instant of the compaction being committed
 * @throws IOException declared by this method's signature
 */
public void commitCompaction(JavaRDD<WriteStatus> records, JavaRDD<DeltaWriteStats> generatedDataStats, Option<String> instantTime) throws IOException {
    if (cfg.useDeltaStreamer) {
        return;
    }

    // Store the checkpoint in the commit metadata, mirroring what
    // HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option) does.
    final Map<String, String> commitExtras = new HashMap<>();
    commitExtras.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());

    // NOTE(review): the guard requires more than one stats entry, so a batch with exactly one
    // entry records no path - confirm this is intended.
    final boolean recordGeneratedPath = generatedDataStats != null && generatedDataStats.count() > 1;
    if (recordGeneratedPath) {
        // Only the first file path is kept, as a pointer to where this batch of data was generated.
        String firstGeneratedPath = generatedDataStats.map(DeltaWriteStats::getFilePath).collect().get(0);
        commitExtras.put(GENERATED_DATA_PATH, firstGeneratedPath);
    }

    HoodieSparkTable<HoodieRecordPayload> hoodieTable = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext());
    HoodieCommitMetadata compactionMetadata = CompactHelpers.getInstance()
        .createCompactionMetadata(hoodieTable, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema());
    writeClient.commitCompaction(instantTime.get(), compactionMetadata, Option.of(commitExtras));
}
Also used : Arrays(java.util.Arrays) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) HashSet(java.util.HashSet) DagNode(org.apache.hudi.integ.testsuite.dag.nodes.DagNode) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTestSuiteConfig(org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) HoodiePayloadConfig(org.apache.hudi.config.HoodiePayloadConfig) CompactHelpers(org.apache.hudi.table.action.compact.CompactHelpers) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) Serializable(java.io.Serializable) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) RollbackNode(org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode) 
SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) CleanNode(org.apache.hudi.integ.testsuite.dag.nodes.CleanNode) ScheduleCompactNode(org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) RDD(org.apache.spark.rdd.RDD) Pair(org.apache.hudi.common.util.collection.Pair) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HashMap(java.util.HashMap) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)

Example 4 with DeltaWriteStats

use of org.apache.hudi.integ.testsuite.writer.DeltaWriteStats in project hudi by apache.

the class TestDFSHoodieTestSuiteWriterAdapter method testDFSOneFileWrite.

/**
 * Writes ten records through a {@code DFSDeltaWriterAdapter} backed by a mocked writer that can
 * always accept data, and checks that {@code canWrite()} was consulted ten times and the writer
 * was closed exactly once.
 */
@Test
public void testDFSOneFileWrite() throws IOException {
    // Mocked collaborators: the underlying file writer and the stats object it reports.
    DeltaWriteStats stubbedStats = Mockito.mock(DeltaWriteStats.class);
    DeltaInputWriter<GenericRecord> writerMock = Mockito.mock(AvroFileDeltaInputWriter.class);

    when(writerMock.getDeltaWriteStats()).thenReturn(stubbedStats);
    when(writerMock.getNewWriter()).thenReturn(writerMock);
    // The writer always reports that it can take more data.
    when(writerMock.canWrite()).thenReturn(true);

    // Ten records, collected to the driver so they can be fed through a plain iterator.
    JavaRDD<GenericRecord> tenRecords = TestUtils.makeRDD(jsc, 10);
    DeltaWriterAdapter<GenericRecord> adapterUnderTest = new DFSDeltaWriterAdapter(writerMock);
    adapterUnderTest.write(tenRecords.collect().iterator());

    Mockito.verify(writerMock, times(10)).canWrite();
    Mockito.verify(writerMock, times(1)).close();
}
Also used : DFSDeltaWriterAdapter(org.apache.hudi.integ.testsuite.writer.DFSDeltaWriterAdapter) GenericRecord(org.apache.avro.generic.GenericRecord) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) Test(org.junit.jupiter.api.Test)

Example 5 with DeltaWriteStats

use of org.apache.hudi.integ.testsuite.writer.DeltaWriteStats in project hudi by apache.

the class TestFileDeltaInputWriter method testAvroFileSinkCreateNewWriter.

/**
 * Writes 100 generated payloads through an {@code AvroFileDeltaInputWriter}, closes it, then
 * asks it for a new writer and checks the stats that new writer reports after being closed
 * without any records written to it.
 */
@Test
public void testAvroFileSinkCreateNewWriter() throws IOException {
    // 1. Create an Avro file sink writer
    DeltaInputWriter<GenericRecord> fileSinkWriter = new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath, schemaProvider.getSourceSchema().toString(), 1024 * 1024L);
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
    // 2. Generate 100 avro payloads and write them to an avro file
    IntStream.range(0, 100).forEach(a -> {
        try {
            fileSinkWriter.writeData(payloadGenerator.getNewPayload());
        } catch (IOException io) {
            // forEach's lambda cannot throw checked exceptions, so rethrow unchecked
            throw new UncheckedIOException(io);
        }
    });
    fileSinkWriter.close();
    String oldFilePath = fileSinkWriter.getDeltaWriteStats().getFilePath();
    assertFalse(oldFilePath == null);
    // 3. Obtain a new writer from the closed one and close it without writing any record
    DeltaInputWriter<GenericRecord> newFileSinkWriter = fileSinkWriter.getNewWriter();
    newFileSinkWriter.close();
    DeltaWriteStats newStats = newFileSinkWriter.getDeltaWriteStats();
    // JUnit 5 assertEquals takes (expected, actual); this order keeps failure messages correct.
    // NOTE(review): 3674 is presumably the size of an empty file for this schema - confirm.
    assertEquals(3674, newStats.getBytesWritten());
    assertEquals(0, newStats.getRecordsWritten());
    assertTrue(newStats.getFilePath() != null);
}
Also used : AvroFileDeltaInputWriter(org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter) GenericRecordFullPayloadGenerator(org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) GenericRecord(org.apache.avro.generic.GenericRecord) DeltaWriteStats(org.apache.hudi.integ.testsuite.writer.DeltaWriteStats) Test(org.junit.jupiter.api.Test)

Aggregations

GenericRecord (org.apache.avro.generic.GenericRecord)6 DeltaWriteStats (org.apache.hudi.integ.testsuite.writer.DeltaWriteStats)6 IOException (java.io.IOException)4 Test (org.junit.jupiter.api.Test)4 UncheckedIOException (java.io.UncheckedIOException)3 Arrays (java.util.Arrays)3 Option (org.apache.hudi.common.util.Option)3 JavaRDD (org.apache.spark.api.java.JavaRDD)3 Serializable (java.io.Serializable)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Map (java.util.Map)2 Collectors (java.util.stream.Collectors)2 IntStream (java.util.stream.IntStream)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 Path (org.apache.hadoop.fs.Path)2 FSUtils (org.apache.hudi.common.fs.FSUtils)2 GenericRecordFullPayloadGenerator (org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator)2 AvroFileDeltaInputWriter (org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter)2 ArrayList (java.util.ArrayList)1