
Example 6 with AvroKey

Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.

From the class TopkPhaseTest, method testTopKColumnTransformationPhase.

@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<>(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }
    List<Pair<BytesWritable, BytesWritable>> result = mapDriver.run();
    // for each record, the mapper emits 2 pairs per dimension (3 dimensions here):
    // one for the actual dimension value and one for the ALL,ALL placeholder,
    // hence the expected recordCount * 3 * 2 output records
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount * 3 * 2, result.size());
    Map<String, Integer> counts = new HashMap<>();
    for (Pair<BytesWritable, BytesWritable> pair : result) {
        TopKPhaseMapOutputKey key = TopKPhaseMapOutputKey.fromBytes(pair.getFirst().getBytes());
        String dimensionName = key.getDimensionName();
        Integer count = counts.get(dimensionName);
        if (count == null) {
            count = 0;
        }
        counts.put(dimensionName, count + 1);
    }
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d1"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d2"));
    Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d3"));
    Assert.assertEquals("Incorrect number of records emitted from map", 6, (int) counts.get("0"));
    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(result);
    reduceDriver.addAll(reduceInput);
    reduceDriver.run();
    File topKFile = new File(outputPath, ThirdEyeConstants.TOPK_VALUES_FILE);
    Assert.assertTrue("Topk file failed to generate!", topKFile.exists());
    TopKDimensionValues topk = OBJECT_MAPPER.readValue(new FileInputStream(topKFile), TopKDimensionValues.class);
    Map<String, Set<String>> topkMap = topk.getTopKDimensions();
    Assert.assertEquals("Incorrect topk object", topkMap.size(), 1);
    Assert.assertEquals("Incorrect topk values in topk object", Sets.newHashSet("pqr1"), topkMap.get("d2"));
    Assert.assertEquals("Incorrect whitelist values in topk object", null, topkMap.get("d3"));
}
Also used : Set(java.util.Set) HashMap(java.util.HashMap) AvroKey(org.apache.avro.mapred.AvroKey) BytesWritable(org.apache.hadoop.io.BytesWritable) NullWritable(org.apache.hadoop.io.NullWritable) FileInputStream(java.io.FileInputStream) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Pair(org.apache.hadoop.mrunit.types.Pair) Test(org.junit.Test)
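
These tests rely on mapDriver and reduceDriver fields that are configured outside the snippet. A minimal sketch of how such MRUnit drivers are typically wired up; TopKPhaseMapper and TopKPhaseReducer are hypothetical stand-ins for the real classes under test, and the reducer's output types are assumptions:

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;

private MapDriver<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> mapDriver;
private ReduceDriver<BytesWritable, BytesWritable, BytesWritable, NullWritable> reduceDriver;

@Before
public void setUp() {
    // TopKPhaseMapper and TopKPhaseReducer stand in for the real classes
    // under test; the generic parameters mirror the types the test exercises.
    mapDriver = MapDriver.newMapDriver(new TopKPhaseMapper());
    reduceDriver = ReduceDriver.newReduceDriver(new TopKPhaseReducer());
}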

Example 7 with AvroKey

Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.

From the class DerivedColumnNoTransformationTest, method testTopKColumnTransformationPhase.

@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<>(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }
    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());
    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
        GenericRecord datum = pair.getFirst().datum();
        // print the field count for debugging alongside the assertion below
        System.out.println(datum.getSchema().getFields().size());
        Assert.assertEquals("Input records must contain same number of fields as output record, when schemas are not transformed", datum.getSchema().getFields().size(), 6);
    }
}
Also used : AvroKey(org.apache.avro.mapred.AvroKey) GenericRecord(org.apache.avro.generic.GenericRecord) NullWritable(org.apache.hadoop.io.NullWritable) Pair(org.apache.hadoop.mrunit.types.Pair) Test(org.junit.Test)
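
The resetAvroSerialization() call matters here because MRUnit serializes the mapper's output, and AvroKey is not a Writable. A minimal sketch of what such a helper typically configures, assuming the record schema is available in a field named outputSchema (the field name is an assumption):

import org.apache.avro.Schema;
import org.apache.avro.hadoop.io.AvroSerialization;
import org.apache.hadoop.conf.Configuration;

private void resetAvroSerialization() {
    Configuration conf = mapDriver.getConfiguration();
    // Register Avro's Hadoop serialization so MRUnit can round-trip
    // AvroKey<GenericRecord> values between the mapper and the driver.
    AvroSerialization.addToConfiguration(conf);
    // Both writer and reader schemas must be set for the key type;
    // outputSchema is assumed to be the schema of the mapper's output records.
    AvroSerialization.setKeyWriterSchema(conf, outputSchema);
    AvroSerialization.setKeyReaderSchema(conf, outputSchema);
}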

Example 8 with AvroKey

Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.

From the class DerivedColumnTransformationTest, method testTopKColumnTransformationPhase.

@Test
public void testTopKColumnTransformationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<>(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }
    resetAvroSerialization();
    List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
    Assert.assertEquals(recordCount, result.size());
    for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
        GenericRecord datum = pair.getFirst().datum();
        Assert.assertEquals("TopKTransformationJob did not add new column for topk column", datum.getSchema().getField("d2_topk") != null, true);
        String d2 = (String) datum.get("d2");
        String d2_topk = (String) datum.get("d2_topk");
        Assert.assertEquals("Incorrect topk column transformation", (d2_topk.equals("other") && d2.equals("pqr1")) || (d2_topk.equals("pqr2") && d2.equals("pqr2")), true);
    }
}
Also used : AvroKey(org.apache.avro.mapred.AvroKey) GenericRecord(org.apache.avro.generic.GenericRecord) NullWritable(org.apache.hadoop.io.NullWritable) Pair(org.apache.hadoop.mrunit.types.Pair) Test(org.junit.Test)
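
generateTestData() is not shown, but GenericRecordBuilder (listed in the aggregations below) is the usual way to build such input records. A plausible sketch against an assumed schema; the field names echo the dimension and time columns seen in these tests but are otherwise illustrative:

import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;

// schema is assumed to be the Avro schema used by the job under test.
private static List<GenericRecord> generateTestData(Schema schema) {
    List<GenericRecord> records = new ArrayList<>();
    // Two records with distinct dimension values, matching the pattern the
    // assertions above check for (d2 = "pqr1" vs. d2 = "pqr2").
    records.add(new GenericRecordBuilder(schema)
        .set("d1", "abc1").set("d2", "pqr1").set("d3", "xyz1")
        .set("hoursSinceEpoch", 406058L)
        .build());
    records.add(new GenericRecordBuilder(schema)
        .set("d1", "abc2").set("d2", "pqr2").set("d3", "xyz2")
        .set("hoursSinceEpoch", 406059L)
        .build());
    return records;
}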

Example 9 with AvroKey

Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.

From the class AggregationPhaseTest, method testAggregationPhase.

@Test
public void testAggregationPhase() throws Exception {
    int recordCount = 0;
    List<GenericRecord> inputRecords = generateTestMapperData();
    for (GenericRecord record : inputRecords) {
        AvroKey<GenericRecord> inKey = new AvroKey<>(record);
        mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
        recordCount++;
    }
    List<Pair<BytesWritable, BytesWritable>> mapResult = mapDriver.run();
    Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount, mapResult.size());
    AggregationPhaseMapOutputKey keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(0).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(1).getFirst().getBytes());
    Assert.assertEquals(406058, keyWrapper.getTime());
    keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(2).getFirst().getBytes());
    Assert.assertEquals(406059, keyWrapper.getTime());
    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(mapResult);
    reduceDriver.addAll(reduceInput);
    List<Pair<AvroKey<GenericRecord>, NullWritable>> reduceResult = reduceDriver.run();
    Assert.assertEquals("Incorrect number of records returned by aggregation reducer", 2, reduceResult.size());
    GenericRecord record = reduceResult.get(0).getFirst().datum();
    List<String> dimensionsExpected = Lists.newArrayList("abc1", "pqr1", "xyz1");
    List<String> dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    List<Integer> metricsExpected = Lists.newArrayList(200, 40);
    List<Integer> metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406058, (long) record.get("hoursSinceEpoch"));
    record = reduceResult.get(1).getFirst().datum();
    dimensionsExpected = Lists.newArrayList("abc2", "pqr2", "xyz2");
    dimensionsActual = getDimensionsFromRecord(record);
    Assert.assertEquals(dimensionsExpected, dimensionsActual);
    metricsExpected = Lists.newArrayList(10, 2);
    metricsActual = getMetricsFromRecord(record);
    Assert.assertEquals(metricsExpected, metricsActual);
    Assert.assertEquals(406059, (long) record.get("hoursSinceEpoch"));
}
Also used : AvroKey(org.apache.avro.mapred.AvroKey) BytesWritable(org.apache.hadoop.io.BytesWritable) NullWritable(org.apache.hadoop.io.NullWritable) GenericRecord(org.apache.avro.generic.GenericRecord) Pair(org.apache.hadoop.mrunit.types.Pair) Test(org.junit.Test)
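
generateTestReduceData is another helper the listing omits; it groups the mapper's (key, value) pairs by key, the way the shuffle would, before they are fed to the reduce driver. A plausible sketch (the project's actual helper may differ):

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mrunit.types.Pair;

private static List<Pair<BytesWritable, List<BytesWritable>>> generateTestReduceData(
        List<Pair<BytesWritable, BytesWritable>> mapResult) {
    // Group values under their key in first-seen order, mimicking the
    // grouping the MapReduce shuffle performs before the reduce phase.
    Map<BytesWritable, List<BytesWritable>> grouped = new LinkedHashMap<>();
    for (Pair<BytesWritable, BytesWritable> pair : mapResult) {
        grouped.computeIfAbsent(pair.getFirst(), k -> new ArrayList<>()).add(pair.getSecond());
    }
    List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = new ArrayList<>();
    for (Map.Entry<BytesWritable, List<BytesWritable>> entry : grouped.entrySet()) {
        reduceInput.add(new Pair<>(entry.getKey(), entry.getValue()));
    }
    return reduceInput;
}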

Example 10 with AvroKey

Use of org.apache.avro.mapred.AvroKey in project spark-dataflow by cloudera.

From the class TransformTranslator, method writeAvro.

private static <T> TransformEvaluator<AvroIO.Write.Bound<T>> writeAvro() {
    return new TransformEvaluator<AvroIO.Write.Bound<T>>() {

        @Override
        public void evaluate(AvroIO.Write.Bound<T> transform, EvaluationContext context) {
            Job job;
            try {
                job = Job.getInstance();
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            AvroJob.setOutputKeySchema(job, transform.getSchema());
            @SuppressWarnings("unchecked") JavaPairRDD<AvroKey<T>, NullWritable> last = ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform)).map(WindowingHelpers.<T>unwindowFunction()).mapToPair(new PairFunction<T, AvroKey<T>, NullWritable>() {

                @Override
                public Tuple2<AvroKey<T>, NullWritable> call(T t) throws Exception {
                    return new Tuple2<>(new AvroKey<>(t), NullWritable.get());
                }
            });
            ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(
                transform.getNumShards(), transform.getShardTemplate(),
                transform.getFilenamePrefix(), transform.getFilenameSuffix());
            writeHadoopFile(last, job.getConfiguration(), shardTemplateInfo, AvroKey.class, NullWritable.class, TemplatedAvroKeyOutputFormat.class);
        }
    };
}
Also used : AvroIO(com.google.cloud.dataflow.sdk.io.AvroIO) AvroKey(org.apache.avro.mapred.AvroKey) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) CannotProvideCoderException(com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException) IOException(java.io.IOException) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) Tuple2(scala.Tuple2) Job(org.apache.hadoop.mapreduce.Job) AvroJob(org.apache.avro.mapreduce.AvroJob)
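
Stripped of the Dataflow plumbing, the same AvroKey write pattern is a standard saveAsNewAPIHadoopFile call on a JavaPairRDD. A minimal standalone sketch; records, schema, and outputPath are assumed to be supplied by the caller:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

static void writeAvro(JavaRDD<GenericRecord> records, Schema schema, String outputPath)
        throws Exception {
    Job job = Job.getInstance();
    AvroJob.setOutputKeySchema(job, schema);
    // Pair each record with NullWritable under an AvroKey, exactly as the
    // translator above does before handing off to the Avro output format.
    JavaPairRDD<AvroKey<GenericRecord>, NullWritable> pairs = records.mapToPair(
        record -> new Tuple2<>(new AvroKey<>(record), NullWritable.get()));
    pairs.saveAsNewAPIHadoopFile(outputPath, AvroKey.class, NullWritable.class,
        AvroKeyOutputFormat.class, job.getConfiguration());
}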

Aggregations

AvroKey (org.apache.avro.mapred.AvroKey): 12
NullWritable (org.apache.hadoop.io.NullWritable): 7
Test (org.junit.Test): 7
GenericRecord (org.apache.avro.generic.GenericRecord): 5
IndexedRecord (org.apache.avro.generic.IndexedRecord): 4
AvroValue (org.apache.avro.mapred.AvroValue): 4
Pair (org.apache.hadoop.mrunit.types.Pair): 4
Configuration (org.apache.hadoop.conf.Configuration): 3
AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO): 2
WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue): 2
Schema (org.apache.avro.Schema): 2
BytesWritable (org.apache.hadoop.io.BytesWritable): 2
CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException): 1
File (java.io.File): 1
FileInputStream (java.io.FileInputStream): 1
IOException (java.io.IOException): 1
HashMap (java.util.HashMap): 1
Set (java.util.Set): 1
GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder): 1
AvroJob (org.apache.avro.mapreduce.AvroJob): 1