Use of org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig in project hudi by apache.
Class DeltaGenerator, method generateDeletes:
public JavaRDD<GenericRecord> generateDeletes(Config config) throws IOException {
  if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) {
    DeltaInputReader deltaInputReader = null;
    JavaRDD<GenericRecord> adjustedRDD = null;
    if (config.getNumDeletePartitions() < 1) {
      // randomly generate deletes for a given number of records without regard to partitions and files
      deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
          ((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
      adjustedRDD = deltaInputReader.read(config.getNumRecordsDelete());
      adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsDelete());
    } else {
      deltaInputReader = new DFSHoodieDatasetInputReader(jsc,
          ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(), schemaStr);
      if (config.getFractionUpsertPerFile() > 0) {
        adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(),
            config.getFractionUpsertPerFile());
      } else {
        adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(),
            config.getNumRecordsDelete());
      }
    }
    log.info("Repartitioning records for delete");
    // persist this since we will make multiple passes over this
    adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
    Converter converter = new DeleteConverter(schemaStr, config.getRecordSize());
    JavaRDD<GenericRecord> convertedRecords = converter.convert(adjustedRDD);
    JavaRDD<GenericRecord> deletes = convertedRecords.map(record -> {
      record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId);
      return record;
    });
    deletes.persist(StorageLevel.DISK_ONLY());
    return deletes;
  } else {
    throw new IllegalArgumentException("Other formats are not supported at the moment");
  }
}
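The method above only assembles a lazy Spark pipeline; the Avro or dataset files are read when an action runs on the returned RDD. A minimal usage sketch, assuming a DeltaGenerator already constructed with a DFSDeltaConfig and a pre-built workload Config here called deleteConfig (an illustrative name, not from the snippet):

// deleteConfig is assumed to carry numRecordsDelete and, optionally, numDeletePartitions,
// the fields that generateDeletes reads above.
JavaRDD<GenericRecord> deletes = deltaGenerator.generateDeletes(deleteConfig);
// Any action materializes the pipeline; generateDeletes already persisted the RDD to DISK_ONLY,
// so later passes reuse the spilled data instead of re-reading DFS.
long numDeletes = deletes.count();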
Use of org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig in project hudi by apache.
Class WriterContext, method initContext:
public void initContext(JavaSparkContext jsc) throws HoodieException {
  try {
    this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jsc);
    String schemaStr = schemaProvider.getSourceSchema().toString();
    this.hoodieTestSuiteWriter = new HoodieTestSuiteWriter(jsc, props, cfg, schemaStr);
    int inputParallelism = cfg.inputParallelism > 0 ? cfg.inputParallelism : jsc.defaultParallelism();
    this.deltaGenerator = new DeltaGenerator(
        new DFSDeltaConfig(DeltaOutputMode.valueOf(cfg.outputTypeName), DeltaInputType.valueOf(cfg.inputFormatName),
            new SerializableConfiguration(jsc.hadoopConfiguration()), cfg.inputBasePath, cfg.targetBasePath,
            schemaStr, cfg.limitFileSize, inputParallelism, cfg.deleteOldInput),
        jsc, sparkSession, schemaStr, keyGenerator);
    log.info(String.format("Initialized writerContext with: %s", schemaStr));
  } catch (Exception e) {
    throw new HoodieException("Failed to reinitialize writerContext", e);
  }
}
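For reference, the DFSDeltaConfig constructor used here takes, in order: the delta output mode, the delta input type, a serializable Hadoop configuration, the input base path, the target dataset base path, the Avro schema string, the per-file size limit, the input parallelism, and a flag controlling deletion of old input. A minimal sketch of building one directly, with illustrative paths and sizes rather than values from the project:

DFSDeltaConfig deltaConfig = new DFSDeltaConfig(
    DeltaOutputMode.DFS,                                       // write generated delta files to DFS
    DeltaInputType.AVRO,                                       // generate Avro input files
    new SerializableConfiguration(jsc.hadoopConfiguration()),  // Hadoop conf wrapped for Spark serialization
    "/tmp/hudi-test-suite/input",                              // illustrative input base path
    "/tmp/hudi-test-suite/output",                             // illustrative target dataset path
    schemaStr,                                                 // Avro schema as a String
    1024 * 1024L,                                              // example per-file size limit, in bytes
    jsc.defaultParallelism(),                                  // input parallelism
    false);                                                    // keep older input batches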
Use of org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig in project hudi by apache.
Class DeltaWriterFactory, method getDeltaWriterAdapter:
public static DeltaWriterAdapter getDeltaWriterAdapter(DeltaConfig config, Integer batchId) throws IOException {
  switch (config.getDeltaOutputMode()) {
    case DFS:
      switch (config.getDeltaInputType()) {
        case AVRO:
          DFSDeltaConfig dfsDeltaConfig = (DFSDeltaConfig) config;
          dfsDeltaConfig.setBatchId(batchId);
          DeltaInputWriter<GenericRecord> fileDeltaInputGenerator = new AvroFileDeltaInputWriter(
              dfsDeltaConfig.getConfiguration(),
              StringUtils.join(new String[] {dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()}, "/"),
              dfsDeltaConfig.getSchemaStr(), dfsDeltaConfig.getMaxFileSize());
          return new DFSDeltaWriterAdapter(fileDeltaInputGenerator, batchId);
        default:
          throw new IllegalArgumentException("Invalid delta input format " + config.getDeltaInputType());
      }
    default:
      throw new IllegalArgumentException("Invalid delta input type " + config.getDeltaOutputMode());
  }
}
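As the StringUtils.join call shows, the returned adapter writes Avro files under <deltaBasePath>/<batchId>. A short usage sketch, assuming a DFSDeltaConfig named dfsSinkConfig and an Iterator<GenericRecord> named recordIterator already exist (the functional test in the next example exercises the same flow end to end):

// Request a writer adapter for batch id 1; the factory stamps the batch id into the config,
// so generated files land under <deltaBasePath>/1.
DeltaWriterAdapter<GenericRecord> adapter = DeltaWriterFactory.getDeltaWriterAdapter(dfsSinkConfig, 1);
adapter.write(recordIterator);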
Use of org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig in project hudi by apache.
Class TestDFSHoodieTestSuiteWriterAdapter, method testDFSWorkloadSinkWithMultipleFilesFunctional:
@Test
public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException {
  DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
      new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath,
      schemaProvider.getSourceSchema().toString(), 10240L, jsc.defaultParallelism(), false);
  DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = DeltaWriterFactory.getDeltaWriterAdapter(dfsSinkConfig, 1);
  FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000,
      schemaProvider.getSourceSchema().toString());
  dfsDeltaWriterAdapter.write(itr);
  FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
  FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath));
  // Since maxFileSize was 10240L and we produced 1K records each close to 1K size, we should produce more than
  // 1 file
  assertTrue(fileStatuses.length > 0);
}
Use of org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig in project hudi by apache.
Class DeltaGenerator, method generateUpdates:
public JavaRDD<GenericRecord> generateUpdates(Config config) throws IOException {
  if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) {
    JavaRDD<GenericRecord> inserts = null;
    if (config.getNumRecordsInsert() > 0) {
      inserts = generateInserts(config);
    }
    DeltaInputReader deltaInputReader = null;
    JavaRDD<GenericRecord> adjustedRDD = null;
    if (config.getNumUpsertPartitions() != 0) {
      if (config.getNumUpsertPartitions() < 0) {
        // randomly generate updates for a given number of records without regard to partitions and files
        deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
            ((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
        adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert());
        adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert());
      } else {
        deltaInputReader = new DFSHoodieDatasetInputReader(jsc,
            ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(), schemaStr);
        if (config.getFractionUpsertPerFile() > 0) {
          adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(),
              config.getFractionUpsertPerFile());
        } else {
          adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(),
              config.getNumRecordsUpsert());
        }
      }
      // persist this since we will make multiple passes over this
      int numPartition = Math.min(deltaOutputConfig.getInputParallelism(),
          Math.max(1, config.getNumUpsertPartitions()));
      log.info("Repartitioning records into " + numPartition + " partitions for updates");
      adjustedRDD = adjustedRDD.repartition(numPartition);
      log.info("Repartitioning records done for updates");
      UpdateConverter converter = new UpdateConverter(schemaStr, config.getRecordSize(),
          partitionPathFieldNames, recordRowKeyFieldNames);
      JavaRDD<GenericRecord> convertedRecords = converter.convert(adjustedRDD);
      JavaRDD<GenericRecord> updates = convertedRecords.map(record -> {
        record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId);
        return record;
      });
      updates.persist(StorageLevel.DISK_ONLY());
      if (inserts == null) {
        inserts = updates;
      } else {
        inserts = inserts.union(updates);
      }
    }
    // TODO : Generate updates for only N partitions.
    return inserts;
  } else {
    throw new IllegalArgumentException("Other formats are not supported at the moment");
  }
}
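A companion usage sketch for the update path, assuming a pre-built workload Config here called upsertConfig (an illustrative name): generateUpdates returns the freshly generated inserts, if any were requested, unioned with the updated records.

// upsertConfig is assumed to carry numRecordsUpsert, numUpsertPartitions, numUpsertFiles and,
// optionally, numRecordsInsert, the fields that generateUpdates reads above.
JavaRDD<GenericRecord> upsertBatch = deltaGenerator.generateUpdates(upsertConfig);
// The updates portion was persisted to DISK_ONLY inside generateUpdates; an action materializes it.
long batchSize = upsertBatch.count();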