Example 11 with AvroKey

Use of org.apache.avro.mapred.AvroKey in the project spark-dataflow by Cloudera, in the class TransformTranslator, method readAvro:

private static <T> TransformEvaluator<AvroIO.Read.Bound<T>> readAvro() {
    return new TransformEvaluator<AvroIO.Read.Bound<T>>() {

        @Override
        public void evaluate(AvroIO.Read.Bound<T> transform, EvaluationContext context) {
            String pattern = transform.getFilepattern();
            JavaSparkContext jsc = context.getSparkContext();
            // Read the Avro files through the Hadoop input format: the keys of the
            // resulting pair RDD are AvroKey wrappers and the values are NullWritable.
            @SuppressWarnings("unchecked")
            JavaRDD<AvroKey<T>> avroFile = (JavaRDD<AvroKey<T>>) (JavaRDD<?>) jsc.newAPIHadoopFile(
                    pattern, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
                    new Configuration()).keys();
            // Unwrap each AvroKey to its datum, then wrap every element in a WindowedValue.
            JavaRDD<WindowedValue<T>> rdd = avroFile.map(new Function<AvroKey<T>, T>() {

                @Override
                public T call(AvroKey<T> key) {
                    return key.datum();
                }
            }).map(WindowingHelpers.<T>windowFunction());
            context.setOutputRDD(transform, rdd);
        }
        }
    };
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO), AvroKey (org.apache.avro.mapred.AvroKey), NullWritable (org.apache.hadoop.io.NullWritable), JavaRDD (org.apache.spark.api.java.JavaRDD), Function (org.apache.spark.api.java.function.Function), PairFunction (org.apache.spark.api.java.function.PairFunction), WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), AvroKeyInputFormat (org.apache.avro.mapreduce.AvroKeyInputFormat)
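
For comparison, here is a minimal standalone sketch of the same read pattern, assuming Spark's Java API with Java 8 and generic Avro records; the input path, class name, and application name are placeholders, not part of spark-dataflow:

// A minimal sketch of reading Avro files into a JavaRDD via AvroKeyInputFormat,
// mirroring readAvro() above. The path "hdfs:///data/events/*.avro" is a placeholder.
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class AvroReadSketch {

    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("avro-read-sketch").setMaster("local[*]"));
        // The raw-class newAPIHadoopFile API forces the same double cast used in readAvro() above.
        @SuppressWarnings("unchecked")
        JavaRDD<AvroKey<GenericRecord>> keys = (JavaRDD<AvroKey<GenericRecord>>) (JavaRDD<?>)
                jsc.newAPIHadoopFile("hdfs:///data/events/*.avro", AvroKeyInputFormat.class,
                        AvroKey.class, NullWritable.class, new Configuration()).keys();
        // Unwrap each AvroKey to its GenericRecord datum.
        JavaRDD<GenericRecord> records = keys.map(AvroKey::datum);
        System.out.println("record count: " + records.count());
        jsc.stop();
    }
}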

Example 12 with AvroKey

Use of org.apache.avro.mapred.AvroKey in the project incubator-gobblin by Apache, in the class AvroKeyDedupReducerTest, method testReduce:

@Test
public void testReduce() throws IOException, InterruptedException {
    // Build the nested "key" record shared by all test values, then wrap it in the full key schema.
    Schema keySchema = new Schema.Parser().parse(KEY_SCHEMA);
    GenericRecordBuilder keyRecordBuilder = new GenericRecordBuilder(keySchema.getField("key").schema());
    keyRecordBuilder.set("partitionKey", 1);
    keyRecordBuilder.set("environment", "test");
    keyRecordBuilder.set("subKey", "2");
    GenericRecord record = keyRecordBuilder.build();
    keyRecordBuilder = new GenericRecordBuilder(keySchema);
    keyRecordBuilder.set("key", record);
    GenericRecord keyRecord = keyRecordBuilder.build();
    // Test reducer with delta field "scn"
    Schema fullSchema = new Schema.Parser().parse(FULL_SCHEMA);
    // Four values that share the same key and differ only in the delta fields.
    AvroValue<GenericRecord> fullRecord1 = new AvroValue<>();
    AvroValue<GenericRecord> fullRecord2 = new AvroValue<>();
    AvroValue<GenericRecord> fullRecord3 = new AvroValue<>();
    AvroValue<GenericRecord> fullRecord4 = new AvroValue<>();
    GenericRecordBuilder fullRecordBuilder1 = new GenericRecordBuilder(fullSchema);
    fullRecordBuilder1.set("key", record);
    fullRecordBuilder1.set("scn", 123);
    fullRecordBuilder1.set("scn2", 100);
    fullRecord1.datum(fullRecordBuilder1.build());
    fullRecordBuilder1.set("scn", 125);
    fullRecordBuilder1.set("scn2", 1);
    fullRecord2.datum(fullRecordBuilder1.build());
    fullRecordBuilder1.set("scn", 124);
    fullRecordBuilder1.set("scn2", 10);
    fullRecord3.datum(fullRecordBuilder1.build());
    fullRecordBuilder1.set("scn", 122);
    fullRecordBuilder1.set("scn2", 1000);
    fullRecord4.datum(fullRecordBuilder1.build());
    Configuration conf = mock(Configuration.class);
    when(conf.get(AvroKeyDedupReducer.DELTA_SCHEMA_PROVIDER)).thenReturn(FieldAttributeBasedDeltaFieldsProvider.class.getName());
    when(conf.get(FieldAttributeBasedDeltaFieldsProvider.ATTRIBUTE_FIELD)).thenReturn("attributes_json");
    when(conf.get(FieldAttributeBasedDeltaFieldsProvider.DELTA_PROP_NAME, FieldAttributeBasedDeltaFieldsProvider.DEFAULT_DELTA_PROP_NAME)).thenReturn(FieldAttributeBasedDeltaFieldsProvider.DEFAULT_DELTA_PROP_NAME);
    AvroKeyDedupReducer reducer = new AvroKeyDedupReducer();
    WrappedReducer.Context reducerContext = mock(WrappedReducer.Context.class);
    when(reducerContext.getConfiguration()).thenReturn(conf);
    Counter moreThan1Counter = new GenericCounter();
    when(reducerContext.getCounter(AvroKeyDedupReducer.EVENT_COUNTER.MORE_THAN_1)).thenReturn(moreThan1Counter);
    Counter dedupedCounter = new GenericCounter();
    when(reducerContext.getCounter(AvroKeyDedupReducer.EVENT_COUNTER.DEDUPED)).thenReturn(dedupedCounter);
    Counter recordCounter = new GenericCounter();
    when(reducerContext.getCounter(AvroKeyDedupReducer.EVENT_COUNTER.RECORD_COUNT)).thenReturn(recordCounter);
    reducer.setup(reducerContext);
    doNothing().when(reducerContext).write(any(AvroKey.class), any(NullWritable.class));
    List<AvroValue<GenericRecord>> valueIterable = Lists.newArrayList(fullRecord1, fullRecord2, fullRecord3, fullRecord4);
    AvroKey<GenericRecord> key = new AvroKey<>();
    key.datum(keyRecord);
    reducer.reduce(key, valueIterable, reducerContext);
    // With delta field "scn", the record carrying the highest scn (fullRecord2, scn=125) survives.
    Assert.assertEquals(reducer.getOutKey().datum(), fullRecord2.datum());
    // Test reducer without delta field
    Configuration conf2 = mock(Configuration.class);
    when(conf2.get(AvroKeyDedupReducer.DELTA_SCHEMA_PROVIDER)).thenReturn(null);
    when(reducerContext.getConfiguration()).thenReturn(conf2);
    AvroKeyDedupReducer reducer2 = new AvroKeyDedupReducer();
    reducer2.setup(reducerContext);
    reducer2.reduce(key, valueIterable, reducerContext);
    // Without a delta field, dedup keeps the first record seen (fullRecord1).
    Assert.assertEquals(reducer2.getOutKey().datum(), fullRecord1.datum());
    // Test reducer with compound delta key.
    Schema fullSchema2 = new Schema.Parser().parse(FULL_SCHEMA_WITH_TWO_DELTA_FIELDS);
    GenericRecordBuilder fullRecordBuilder2 = new GenericRecordBuilder(fullSchema2);
    fullRecordBuilder2.set("key", record);
    fullRecordBuilder2.set("scn", 123);
    fullRecordBuilder2.set("scn2", 100);
    fullRecord1.datum(fullRecordBuilder2.build());
    fullRecordBuilder2.set("scn", 125);
    fullRecordBuilder2.set("scn2", 1000);
    fullRecord2.datum(fullRecordBuilder2.build());
    fullRecordBuilder2.set("scn", 126);
    fullRecordBuilder2.set("scn2", 1000);
    fullRecord3.datum(fullRecordBuilder2.build());
    fullRecordBuilder2.set("scn", 130);
    fullRecordBuilder2.set("scn2", 100);
    fullRecord4.datum(fullRecordBuilder2.build());
    List<AvroValue<GenericRecord>> valueIterable2 = Lists.newArrayList(fullRecord1, fullRecord2, fullRecord3, fullRecord4);
    reducer.reduce(key, valueIterable2, reducerContext);
    // With the compound delta key, fullRecord3 survives: it ties for the highest
    // "scn2" (1000) and has the higher "scn" (126 vs. 125) among the tied records.
    Assert.assertEquals(reducer.getOutKey().datum(), fullRecord3.datum());
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), AvroKey (org.apache.avro.mapred.AvroKey), GenericCounter (org.apache.hadoop.mapreduce.counters.GenericCounter), NullWritable (org.apache.hadoop.io.NullWritable), Counter (org.apache.hadoop.mapreduce.Counter), GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder), WrappedReducer (org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer), AvroValue (org.apache.avro.mapred.AvroValue), GenericRecord (org.apache.avro.generic.GenericRecord), Test (org.testng.annotations.Test)
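
To make the behavior the test asserts concrete, here is a hypothetical stripped-down reducer, not Gobblin's actual AvroKeyDedupReducer: per key, it keeps the value with the largest single integer delta field, borrowing the field name "scn" from the test as an assumption:

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical illustration, not Gobblin's AvroKeyDedupReducer: keeps, per key,
// the record with the largest value of a single integer delta field.
public class MaxDeltaDedupReducer extends
        Reducer<AvroKey<GenericRecord>, AvroValue<GenericRecord>, AvroKey<GenericRecord>, NullWritable> {

    // Assumed delta field name, mirroring "scn" in the test above.
    private static final String DELTA_FIELD = "scn";

    private final AvroKey<GenericRecord> outKey = new AvroKey<>();

    @Override
    protected void reduce(AvroKey<GenericRecord> key, Iterable<AvroValue<GenericRecord>> values,
            Context context) throws IOException, InterruptedException {
        GenericRecord best = null;
        for (AvroValue<GenericRecord> value : values) {
            // Note: in a real job Hadoop reuses the value wrapper across iterations,
            // so the retained datum may need a deep copy before advancing the iterator.
            GenericRecord candidate = value.datum();
            if (best == null || (Integer) candidate.get(DELTA_FIELD) > (Integer) best.get(DELTA_FIELD)) {
                best = candidate;
            }
        }
        if (best != null) {
            outKey.datum(best);
            context.write(outKey, NullWritable.get());
        }
    }
}

The real Gobblin reducer instead resolves its delta fields from the job configuration through a delta-fields provider and supports compound delta keys, which is why the test above mocks Configuration and exercises a third, compound-key case.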

Aggregations

AvroKey (org.apache.avro.mapred.AvroKey): 12
NullWritable (org.apache.hadoop.io.NullWritable): 7
Test (org.junit.Test): 7
GenericRecord (org.apache.avro.generic.GenericRecord): 5
IndexedRecord (org.apache.avro.generic.IndexedRecord): 4
AvroValue (org.apache.avro.mapred.AvroValue): 4
Pair (org.apache.hadoop.mrunit.types.Pair): 4
Configuration (org.apache.hadoop.conf.Configuration): 3
AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO): 2
WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue): 2
Schema (org.apache.avro.Schema): 2
BytesWritable (org.apache.hadoop.io.BytesWritable): 2
CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException): 1
File (java.io.File): 1
FileInputStream (java.io.FileInputStream): 1
IOException (java.io.IOException): 1
HashMap (java.util.HashMap): 1
Set (java.util.Set): 1
GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder): 1
AvroJob (org.apache.avro.mapreduce.AvroJob): 1