Use of co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor in project cdap by caskdata.

Example: the initialize() method of the class DataCleansingMapReduce. It configures the RAW_RECORDS partitioned file set as an incrementally consumed input, with the consumer's state persisted by a KVTableStatePersistor, and registers two partitioned outputs: one for invalid records and one for clean records.
@Override
public void initialize() throws Exception {
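  // Consume the RAW_RECORDS partitioned file set incrementally: PartitionBatchInput processes only
  // partitions added since the previous run, and KVTableStatePersistor keeps the consumer's state
  // in the CONSUMING_STATE KeyValueTable under the key "state.key".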
  MapReduceContext context = getContext();
  partitionCommitter =
    PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS,
                                 new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));
  // Each run writes its output to a partition keyed by the time passed in the runtime arguments
  Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
  PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();
  Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");

  // set up two outputs - one for invalid records and one for valid records
  Map<String, String> invalidRecordsArgs = new HashMap<>();
  PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
  PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
  context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));
  // clean records are partitioned per record by a DynamicPartitioner instead of a fixed
  // output partition key (a sketch of such a partitioner appears below)
  Map<String, String> cleanRecordsArgs = new HashMap<>();
  PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
  PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
  context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));
  Job job = context.getHadoopJob();
  job.setMapperClass(SchemaMatchingFilter.class);
  job.setNumReduceTasks(0);

  // simply propagate the schema (if any) to be used by the mapper
  String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
  if (schemaJson != null) {
    job.getConfiguration().set(SCHEMA_KEY, schemaJson);
  }
}
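
The committer returned by PartitionBatchInput.setInput is responsible for persisting the consumer's state once the run is over, so the program should report success or failure back to it. Below is a minimal sketch of a companion destroy() method, assuming the partitionCommitter field from the snippet above and an import of co.cask.cdap.api.ProgramStatus; it is an illustration, not necessarily the project's exact code.

@Override
public void destroy() {
  // Commit the consumer's state only if the run succeeded, so that a failed run's
  // partitions are processed again on the next run.
  boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
  partitionCommitter.onFinish(succeeded);
}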
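
For context, the dynamic partitioner set on the clean records output derives a PartitionKey per record. The following is only a sketch of what a class like TimeAndZipPartitioner could look like; the JSON parsing with Gson and the "zip" field name are assumptions for illustration (imports omitted, as in the snippet above).

public static final class TimeAndZipPartitioner extends DynamicPartitioner<NullWritable, Text> {

  private Long time;
  private Gson gson;

  @Override
  public void initialize(MapReduceTaskContext<NullWritable, Text> mapReduceTaskContext) {
    // reuse the same output time that initialize() above reads from the runtime arguments
    this.time = Long.valueOf(mapReduceTaskContext.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
    this.gson = new Gson();
  }

  @Override
  public PartitionKey getPartitionKey(NullWritable key, Text value) {
    // illustrative: parse the record as JSON and partition it by time and zip code
    int zip = gson.fromJson(value.toString(), JsonObject.class).get("zip").getAsInt();
    return PartitionKey.builder()
      .addLongField("time", time)
      .addIntField("zip", zip)
      .build();
  }
}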