Search in sources:

Example 1 with KVTableStatePersistor

Use of co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor in the project cdap by caskdata.

The initialize method of the class DataCleansingMapReduce.

/**
 * Configures this MapReduce run before execution: wires up the partitioned input with
 * consuming-state tracking, registers the two partitioned outputs (invalid and clean
 * records), disables the reduce phase, and propagates an optional schema to the mapper
 * through the Hadoop job configuration.
 *
 * @throws IllegalArgumentException if the required {@code OUTPUT_PARTITION_KEY} runtime
 *         argument is missing
 * @throws Exception if input/output setup or Hadoop job configuration fails
 */
@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    // Consume partitions of the raw-records set incrementally; progress is persisted
    // in the consuming-state KV table under the row key "state.key".
    partitionCommitter = PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS, new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));
    // Each run writes its output to a time-keyed partition.
    // Fail fast with a clear message rather than an opaque NullPointerException
    // from Long.valueOf(null) when the runtime argument was not supplied.
    String timeValue = context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY);
    if (timeValue == null) {
        throw new IllegalArgumentException("Missing required runtime argument: " + OUTPUT_PARTITION_KEY);
    }
    Long timeKey = Long.valueOf(timeValue);
    PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();
    // Metadata attached to every output partition created by this run.
    Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");
    // Set up two outputs - one for invalid records and one for valid records.
    Map<String, String> invalidRecordsArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
    PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));
    Map<String, String> cleanRecordsArgs = new HashMap<>();
    // Clean records are partitioned dynamically (per record) by time and zip code,
    // so no fixed output partition key is set here.
    PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
    PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
    context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));
    Job job = context.getHadoopJob();
    job.setMapperClass(SchemaMatchingFilter.class);
    // Map-only job: filtering needs no aggregation step.
    job.setNumReduceTasks(0);
    // Simply propagate the schema (if any) to be used by the mapper.
    String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
    if (schemaJson != null) {
        job.getConfiguration().set(SCHEMA_KEY, schemaJson);
    }
}
Also used : MapReduceContext(co.cask.cdap.api.mapreduce.MapReduceContext) KVTableStatePersistor(co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor) HashMap(java.util.HashMap) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) Job(org.apache.hadoop.mapreduce.Job)

Aggregations

PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)1 KVTableStatePersistor (co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor)1 MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext)1 HashMap (java.util.HashMap)1 Job (org.apache.hadoop.mapreduce.Job)1