Use of org.apache.hudi.integ.testsuite.converter.DeleteConverter in the apache/hudi project.
From the DeltaGenerator class, method generateDeletes:
public JavaRDD<GenericRecord> generateDeletes(Config config) throws IOException {
  if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) {
    DeltaInputReader deltaInputReader = null;
    JavaRDD<GenericRecord> adjustedRDD = null;
    if (config.getNumDeletePartitions() < 1) {
      // randomly generate deletes for a given number of records without regard to partitions and files
      deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
          ((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
      adjustedRDD = deltaInputReader.read(config.getNumRecordsDelete());
      adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsDelete());
    } else {
      // pick records to delete from specific partitions/files of the already written dataset
      deltaInputReader = new DFSHoodieDatasetInputReader(jsc,
          ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(), schemaStr);
      if (config.getFractionUpsertPerFile() > 0) {
        adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(),
            config.getFractionUpsertPerFile());
      } else {
        adjustedRDD = deltaInputReader.read(config.getNumDeletePartitions(), config.getNumUpsertFiles(),
            config.getNumRecordsDelete());
      }
    }
    log.info("Repartitioning records for delete");
    // persist this since we will make multiple passes over this
    adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
    // turn the sampled records into delete records
    Converter converter = new DeleteConverter(schemaStr, config.getRecordSize());
    JavaRDD<GenericRecord> convertedRecords = converter.convert(adjustedRDD);
    JavaRDD<GenericRecord> deletes = convertedRecords.map(record -> {
      // tag each delete with the current batch id as its source ordering value
      record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId);
      return record;
    });
    deletes.persist(StorageLevel.DISK_ONLY());
    return deletes;
  } else {
    throw new IllegalArgumentException("Other formats are not supported at the moment");
  }
}
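For reference, a minimal sketch of the DeleteConverter step in isolation, assuming a JavaRDD<GenericRecord> of source records, the Avro schema string, and illustrative recordSize and batchId values. buildDeletes is a hypothetical helper, not part of DeltaGenerator, and the import paths for Converter and SchemaUtils are inferred from the package named above and may differ.

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.converter.Converter;
import org.apache.hudi.integ.testsuite.converter.DeleteConverter;
import org.apache.hudi.integ.testsuite.schema.SchemaUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.storage.StorageLevel;

// Hypothetical helper mirroring the conversion step above; a sketch, not the project's API.
static JavaRDD<GenericRecord> buildDeletes(JavaRDD<GenericRecord> sourceRecords, String schemaStr,
    int recordSize, int batchId) {
  // convert the sampled source records into delete records
  Converter converter = new DeleteConverter(schemaStr, recordSize);
  JavaRDD<GenericRecord> converted = converter.convert(sourceRecords);
  // stamp the batch id into the source ordering field, as generateDeletes does
  JavaRDD<GenericRecord> deletes = converted.map(record -> {
    record.put(SchemaUtils.SOURCE_ORDERING_FIELD, batchId);
    return record;
  });
  // keep the result on disk because it is read more than once downstream
  deletes.persist(StorageLevel.DISK_ONLY());
  return deletes;
}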