Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.
The class TopkPhaseTest, method testTopKColumnTransformationPhase.
@Test
public void testTopKColumnTransformationPhase() throws Exception {
  int recordCount = 0;
  List<GenericRecord> inputRecords = generateTestMapperData();
  for (GenericRecord record : inputRecords) {
    AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
    inKey.datum(record);
    mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
    recordCount++;
  }
  List<Pair<BytesWritable, BytesWritable>> result = mapDriver.run();
  // For each record, the mapper emits 2 records per dimension:
  // one for the actual dimension value and one for the ALL placeholder.
  Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount * 3 * 2, result.size());
  Map<String, Integer> counts = new HashMap<>();
  for (Pair<BytesWritable, BytesWritable> pair : result) {
    TopKPhaseMapOutputKey key = TopKPhaseMapOutputKey.fromBytes(pair.getFirst().getBytes());
    String dimensionName = key.getDimensionName();
    Integer count = counts.get(dimensionName);
    if (count == null) {
      count = 0;
    }
    counts.put(dimensionName, count + 1);
  }
  Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d1"));
  Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d2"));
  Assert.assertEquals("Incorrect number of records emitted from map", 2, (int) counts.get("d3"));
  Assert.assertEquals("Incorrect number of records emitted from map", 6, (int) counts.get("0"));
  List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(result);
  reduceDriver.addAll(reduceInput);
  reduceDriver.run();
  File topKFile = new File(outputPath, ThirdEyeConstants.TOPK_VALUES_FILE);
  Assert.assertTrue("TopK file failed to generate!", topKFile.exists());
  TopKDimensionValues topk = OBJECT_MAPPER.readValue(new FileInputStream(topKFile), TopKDimensionValues.class);
  Map<String, Set<String>> topkMap = topk.getTopKDimensions();
  Assert.assertEquals("Incorrect topk object", 1, topkMap.size());
  Assert.assertEquals("Incorrect topk values in topk object", Sets.newHashSet("pqr1"), topkMap.get("d2"));
  Assert.assertNull("Incorrect whitelist values in topk object", topkMap.get("d3"));
}
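These tests rely on MRUnit MapDriver and ReduceDriver fields (mapDriver, reduceDriver) that the surrounding test class wires up. A minimal setup sketch for illustration; TopKPhaseMapper and TopKPhaseReducer are stand-in names, and the reducer's output types are assumed:

// MapDriver/ReduceDriver come from org.apache.hadoop.mrunit.mapreduce;
// these fields and this setUp() would live inside the test class.
MapDriver<AvroKey<GenericRecord>, NullWritable, BytesWritable, BytesWritable> mapDriver;
ReduceDriver<BytesWritable, BytesWritable, NullWritable, NullWritable> reduceDriver;

@Before
public void setUp() throws Exception {
  mapDriver = MapDriver.newMapDriver(new TopKPhaseMapper());
  Configuration conf = mapDriver.getConfiguration();
  // Job-specific settings (ThirdEye config, schema paths, etc.) would be set on conf here.
  reduceDriver = ReduceDriver.newReduceDriver(new TopKPhaseReducer());
}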
Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.
The class DerivedColumnNoTransformationTest, method testTopKColumnTransformationPhase.
@Test
public void testTopKColumnTransformationPhase() throws Exception {
  int recordCount = 0;
  List<GenericRecord> inputRecords = generateTestData();
  for (GenericRecord record : inputRecords) {
    AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
    inKey.datum(record);
    mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
    recordCount++;
  }
  resetAvroSerialization();
  List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
  Assert.assertEquals(recordCount, result.size());
  for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
    GenericRecord datum = pair.getFirst().datum();
    System.out.println(datum.getSchema().getFields().size());
    Assert.assertEquals("Output records must contain the same number of fields as input records when the schema is not transformed",
        6, datum.getSchema().getFields().size());
  }
}
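resetAvroSerialization() is a helper on these test classes: MRUnit needs Avro's Hadoop serialization registered before it can copy the AvroKey<GenericRecord> outputs. A plausible sketch, assuming the mapper's output schema is held in an outputSchema field:

private void resetAvroSerialization() throws IOException {
  // AvroSerialization (org.apache.avro.hadoop.io) implements Hadoop's
  // Serialization interface for AvroKey/AvroValue wrappers.
  Configuration conf = mapDriver.getConfiguration();
  AvroSerialization.addToConfiguration(conf);
  // `outputSchema` is an assumed field holding the schema of the emitted records.
  AvroSerialization.setKeyWriterSchema(conf, outputSchema);
  AvroSerialization.setKeyReaderSchema(conf, outputSchema);
}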
Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.
The class DerivedColumnTransformationTest, method testTopKColumnTransformationPhase.
@Test
public void testTopKColumnTransformationPhase() throws Exception {
  int recordCount = 0;
  List<GenericRecord> inputRecords = generateTestData();
  for (GenericRecord record : inputRecords) {
    AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
    inKey.datum(record);
    mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
    recordCount++;
  }
  resetAvroSerialization();
  List<Pair<AvroKey<GenericRecord>, NullWritable>> result = mapDriver.run();
  Assert.assertEquals(recordCount, result.size());
  for (Pair<AvroKey<GenericRecord>, NullWritable> pair : result) {
    GenericRecord datum = pair.getFirst().datum();
    Assert.assertNotNull("TopKTransformationJob did not add new column for topk column",
        datum.getSchema().getField("d2_topk"));
    String d2 = (String) datum.get("d2");
    String d2_topk = (String) datum.get("d2_topk");
    Assert.assertTrue("Incorrect topk column transformation",
        (d2_topk.equals("other") && d2.equals("pqr1")) || (d2_topk.equals("pqr2") && d2.equals("pqr2")));
  }
}
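generateTestData() is another test-class helper. A hedged sketch using Avro's generic API; the schema and field values here are assumptions for illustration only:

private List<GenericRecord> generateTestData() {
  // `schema` is assumed to be the input Avro schema with dimensions d1..d3,
  // metrics m1/m2 and an hoursSinceEpoch time column.
  List<GenericRecord> records = new ArrayList<>();
  GenericRecord record = new GenericData.Record(schema);
  record.put("d1", "abc1");
  record.put("d2", "pqr1");
  record.put("d3", "xyz1");
  record.put("m1", 100L);
  record.put("m2", 20L);
  record.put("hoursSinceEpoch", 406058L);
  records.add(record);
  return records;
}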
Use of org.apache.avro.mapred.AvroKey in project pinot by linkedin.
The class AggregationPhaseTest, method testAggregationPhase.
@Test
public void testAggregationPhase() throws Exception {
  int recordCount = 0;
  List<GenericRecord> inputRecords = generateTestMapperData();
  for (GenericRecord record : inputRecords) {
    AvroKey<GenericRecord> inKey = new AvroKey<GenericRecord>();
    inKey.datum(record);
    mapDriver.addInput(new Pair<AvroKey<GenericRecord>, NullWritable>(inKey, NullWritable.get()));
    recordCount++;
  }
  // Map phase: one output record per input record, keyed by dimensions and time bucket.
  List<Pair<BytesWritable, BytesWritable>> mapResult = mapDriver.run();
  Assert.assertEquals("Incorrect number of records emitted by mapper", recordCount, mapResult.size());
  AggregationPhaseMapOutputKey keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(0).getFirst().getBytes());
  Assert.assertEquals(406058, keyWrapper.getTime());
  keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(1).getFirst().getBytes());
  Assert.assertEquals(406058, keyWrapper.getTime());
  keyWrapper = AggregationPhaseMapOutputKey.fromBytes(mapResult.get(2).getFirst().getBytes());
  Assert.assertEquals(406059, keyWrapper.getTime());
  // Reduce phase: records sharing a key are aggregated into a single output record.
  List<Pair<BytesWritable, List<BytesWritable>>> reduceInput = generateTestReduceData(mapResult);
  reduceDriver.addAll(reduceInput);
  List<Pair<AvroKey<GenericRecord>, NullWritable>> reduceResult = reduceDriver.run();
  Assert.assertEquals("Incorrect number of records returned by aggregation reducer", 2, reduceResult.size());
  GenericRecord record = reduceResult.get(0).getFirst().datum();
  List<String> dimensionsExpected = Lists.newArrayList("abc1", "pqr1", "xyz1");
  List<String> dimensionsActual = getDimensionsFromRecord(record);
  Assert.assertEquals(dimensionsExpected, dimensionsActual);
  List<Integer> metricsExpected = Lists.newArrayList(200, 40);
  List<Integer> metricsActual = getMetricsFromRecord(record);
  Assert.assertEquals(metricsExpected, metricsActual);
  Assert.assertEquals(406058, (long) record.get("hoursSinceEpoch"));
  record = reduceResult.get(1).getFirst().datum();
  dimensionsExpected = Lists.newArrayList("abc2", "pqr2", "xyz2");
  dimensionsActual = getDimensionsFromRecord(record);
  Assert.assertEquals(dimensionsExpected, dimensionsActual);
  metricsExpected = Lists.newArrayList(10, 2);
  metricsActual = getMetricsFromRecord(record);
  Assert.assertEquals(metricsExpected, metricsActual);
  Assert.assertEquals(406059, (long) record.get("hoursSinceEpoch"));
}
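getDimensionsFromRecord and getMetricsFromRecord are small record-inspection helpers. Plausible implementations, with the dimension and metric field names assumed from the expected values above:

private List<String> getDimensionsFromRecord(GenericRecord record) {
  // Field names d1..d3 are assumptions based on the asserted values.
  List<String> dimensions = new ArrayList<>();
  for (String name : Arrays.asList("d1", "d2", "d3")) {
    dimensions.add(record.get(name).toString());
  }
  return dimensions;
}

private List<Integer> getMetricsFromRecord(GenericRecord record) {
  // Metric field names m1/m2 are likewise assumed.
  List<Integer> metrics = new ArrayList<>();
  for (String name : Arrays.asList("m1", "m2")) {
    metrics.add(((Number) record.get(name)).intValue());
  }
  return metrics;
}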
Use of org.apache.avro.mapred.AvroKey in project spark-dataflow by cloudera.
The class TransformTranslator, method writeAvro.
private static <T> TransformEvaluator<AvroIO.Write.Bound<T>> writeAvro() {
  return new TransformEvaluator<AvroIO.Write.Bound<T>>() {
    @Override
    public void evaluate(AvroIO.Write.Bound<T> transform, EvaluationContext context) {
      Job job;
      try {
        job = Job.getInstance();
      } catch (IOException e) {
        throw new IllegalStateException(e);
      }
      AvroJob.setOutputKeySchema(job, transform.getSchema());
      // Unwindow the values, then wrap each element in an AvroKey paired with NullWritable.
      @SuppressWarnings("unchecked")
      JavaPairRDD<AvroKey<T>, NullWritable> last =
          ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform))
              .map(WindowingHelpers.<T>unwindowFunction())
              .mapToPair(new PairFunction<T, AvroKey<T>, NullWritable>() {
                @Override
                public Tuple2<AvroKey<T>, NullWritable> call(T t) throws Exception {
                  return new Tuple2<>(new AvroKey<>(t), NullWritable.get());
                }
              });
      ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(
          transform.getNumShards(), transform.getShardTemplate(), transform.getFilenamePrefix(),
          transform.getFilenameSuffix());
      writeHadoopFile(last, job.getConfiguration(), shardTemplateInfo, AvroKey.class,
          NullWritable.class, TemplatedAvroKeyOutputFormat.class);
    }
  };
}
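writeHadoopFile is defined elsewhere in TransformTranslator; at its core the write ultimately goes through Spark's saveAsNewAPIHadoopFile. A simplified stand-in (not the actual helper) that skips the shard-template handling:

private static <K, V> void writeHadoopFileSketch(JavaPairRDD<K, V> rdd, Configuration conf,
    String path, Class<?> keyClass, Class<?> valueClass,
    Class<? extends org.apache.hadoop.mapreduce.OutputFormat<?, ?>> formatClass) {
  // `path` is assumed; the real helper derives sharded file names from
  // ShardTemplateInformation before writing.
  rdd.saveAsNewAPIHadoopFile(path, keyClass, valueClass, formatClass, conf);
}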