Use of scala.Tuple2 in project cdap by caskdata.
From the class ClassicSparkProgram, method main:
public static void main(String[] args) throws Exception {
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  sparkConf.set("spark.kryo.registrator", MyKryoRegistrator.class.getName());

  Schema schema = Schema.recordOf("record",
                                  Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
                                  Schema.Field.of("id", Schema.of(Schema.Type.INT)));
  List<StructuredRecord> records = new ArrayList<>();
  for (int i = 1; i <= 10; i++) {
    records.add(StructuredRecord.builder(schema).set("name", "Name" + i).set("id", i).build());
  }
  // This tests serialization of StructuredRecord as well as the use of a custom Kryo serializer.
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  int result = jsc.parallelize(records)
    .mapToPair(new PairFunction<StructuredRecord, MyInt, StructuredRecord>() {
      @Override
      public Tuple2<MyInt, StructuredRecord> call(StructuredRecord record) throws Exception {
        return new Tuple2<>(new MyInt((Integer) record.get("id")), record);
      }
    })
    .map(new Function<Tuple2<MyInt, StructuredRecord>, MyInt>() {
      @Override
      public MyInt call(Tuple2<MyInt, StructuredRecord> tuple) throws Exception {
        return tuple._1;
      }
    })
    .reduce(new Function2<MyInt, MyInt, MyInt>() {
      @Override
      public MyInt call(MyInt v1, MyInt v2) throws Exception {
        return new MyInt(v1.toInt() + v2.toInt());
      }
    })
    .toInt();

  if (result != 55) {
    throw new Exception("Expected result to be 55");
  }
}
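The driver above relies on two helper classes that are not shown: MyInt, a small wrapper around a primitive int, and MyKryoRegistrator, which is passed to spark.kryo.registrator. A minimal sketch of what such a registrator could look like, assuming MyInt only wraps an int (the MyIntSerializer name and body here are illustrative, not the project's actual code):

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.Serializer;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import org.apache.spark.serializer.KryoRegistrator;

// Sketch only: registers a hypothetical custom serializer for MyInt with Kryo.
public class MyKryoRegistrator implements KryoRegistrator {

  @Override
  public void registerClasses(Kryo kryo) {
    kryo.register(MyInt.class, new MyIntSerializer());
  }

  // Hypothetical serializer; assumes MyInt exposes toInt() and an int constructor.
  public static class MyIntSerializer extends Serializer<MyInt> {
    @Override
    public void write(Kryo kryo, Output output, MyInt object) {
      output.writeInt(object.toInt());
    }

    @Override
    public MyInt read(Kryo kryo, Input input, Class<MyInt> type) {
      return new MyInt(input.readInt());
    }
  }
}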
Use of scala.Tuple2 in project cdap by caskdata.
From the class SparkAppUsingGetDataset, method parse:
@Nullable
static Tuple2<LogKey, LogStats> parse(Text log) {
  Matcher matcher = CLF_LOG_PATTERN.matcher(log.toString());
  if (matcher.find()) {
    String ip = matcher.group(1);
    String user = matcher.group(3);
    String request = matcher.group(5);
    int code = Integer.parseInt(matcher.group(6));
    int size = Integer.parseInt(matcher.group(7));
    return new Tuple2<>(new LogKey(ip, user, request, code), new LogStats(1, size));
  }
  return null;
}
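CLF_LOG_PATTERN is a constant defined elsewhere in SparkAppUsingGetDataset; parse relies on capture groups 1, 3, 5, 6 and 7 holding the client IP, user, request line, status code and response size of an Apache Common Log Format entry. A regex along the following lines would satisfy that contract (the project's actual pattern may differ in detail):

// Sketch of a Common Log Format pattern compatible with the group numbers used in parse().
// Groups: 1 = client IP, 2 = identity, 3 = user, 4 = timestamp, 5 = request, 6 = status code, 7 = size.
private static final Pattern CLF_LOG_PATTERN =
  Pattern.compile("^(\\S+) (\\S+) (\\S+) \\[([^\\]]+)\\] \"([^\"]*)\" (\\d{3}) (\\d+)");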
Use of scala.Tuple2 in project cdap by caskdata.
From the class SparkLogParser, method run:
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> runtimeArguments = sec.getRuntimeArguments();
  String inputFileSet = runtimeArguments.get("input");
  final String outputTable = runtimeArguments.get("output");

  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
  final JavaPairRDD<String, String> aggregated = input
    .mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {
      @Override
      public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
        return SparkAppUsingGetDataset.parse(input._2());
      }
    })
    .reduceByKey(new Function2<LogStats, LogStats, LogStats>() {
      @Override
      public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
        return stats1.aggregate(stats2);
      }
    })
    .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {
      @Override
      public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
        final Gson gson = new Gson();
        return Lists.newArrayList(
          Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
              return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
            }
          }));
      }
    });
  // Collect all data to the driver and write to the dataset directly. That's the intent of the test.
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      KeyValueTable kvTable = context.getDataset(outputTable);
      for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
        kvTable.write(entry.getKey(), entry.getValue());
      }
    }
  });
}
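Because the Spark function interfaces used here (PairFunction, Function2, PairFlatMapFunction) are single-method interfaces, the same pipeline can be written much more compactly with Java 8 lambdas. A sketch of the equivalent chain, keeping the Spark 1.x Iterable-returning mapPartitionsToPair contract of the original:

// Java 8 sketch of the same aggregation; the TxRunnable that writes the collected map is unchanged.
JavaPairRDD<String, String> aggregated = input
  .mapToPair(in -> SparkAppUsingGetDataset.parse(in._2()))
  .reduceByKey((stats1, stats2) -> stats1.aggregate(stats2))
  .mapPartitionsToPair(itor -> {
    Gson gson = new Gson();  // one Gson per partition, as in the anonymous-class version
    List<Tuple2<String, String>> result = new ArrayList<>();
    while (itor.hasNext()) {
      Tuple2<LogKey, LogStats> t = itor.next();
      result.add(new Tuple2<>(gson.toJson(t._1()), gson.toJson(t._2())));
    }
    return result;
  });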
Use of scala.Tuple2 in project spark-dataflow by cloudera.
From the class TransformTranslator, method readHadoop:
private static <K, V> TransformEvaluator<HadoopIO.Read.Bound<K, V>> readHadoop() {
  return new TransformEvaluator<HadoopIO.Read.Bound<K, V>>() {
    @Override
    public void evaluate(HadoopIO.Read.Bound<K, V> transform, EvaluationContext context) {
      String pattern = transform.getFilepattern();
      JavaSparkContext jsc = context.getSparkContext();
      @SuppressWarnings("unchecked")
      JavaPairRDD<K, V> file = jsc.newAPIHadoopFile(pattern, transform.getFormatClass(),
          transform.getKeyClass(), transform.getValueClass(), new Configuration());
      JavaRDD<WindowedValue<KV<K, V>>> rdd = file
        .map(new Function<Tuple2<K, V>, KV<K, V>>() {
          @Override
          public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
            return KV.of(t2._1(), t2._2());
          }
        })
        .map(WindowingHelpers.<KV<K, V>>windowFunction());
      context.setOutputRDD(transform, rdd);
    }
  };
}
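Tuple2 appears here only as the element type that newAPIHadoopFile produces; the first map immediately converts each pair into Dataflow's KV before windowing. With Java 8 lambdas that conversion collapses to a one-liner (sketch only, same behavior):

// Java 8 sketch of the Tuple2 -> KV conversion and windowing performed above.
JavaRDD<WindowedValue<KV<K, V>>> rdd = file
  .map(t2 -> KV.of(t2._1(), t2._2()))
  .map(WindowingHelpers.<KV<K, V>>windowFunction());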
Use of scala.Tuple2 in project spark-dataflow by cloudera.
From the class TransformTranslator, method writeHadoop:
private static <K, V> TransformEvaluator<HadoopIO.Write.Bound<K, V>> writeHadoop() {
  return new TransformEvaluator<HadoopIO.Write.Bound<K, V>>() {
    @Override
    public void evaluate(HadoopIO.Write.Bound<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaPairRDD<K, V> last = ((JavaRDDLike<WindowedValue<KV<K, V>>, ?>) context.getInputRDD(transform))
        .map(WindowingHelpers.<KV<K, V>>unwindowFunction())
        .mapToPair(new PairFunction<KV<K, V>, K, V>() {
          @Override
          public Tuple2<K, V> call(KV<K, V> t) throws Exception {
            return new Tuple2<>(t.getKey(), t.getValue());
          }
        });
      ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(transform.getNumShards(),
          transform.getShardTemplate(), transform.getFilenamePrefix(), transform.getFilenameSuffix());
      Configuration conf = new Configuration();
      for (Map.Entry<String, String> e : transform.getConfigurationProperties().entrySet()) {
        conf.set(e.getKey(), e.getValue());
      }
      writeHadoopFile(last, conf, shardTemplateInfo, transform.getKeyClass(), transform.getValueClass(),
                      transform.getFormatClass());
    }
  };
}
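writeHadoopFile is a private helper elsewhere in TransformTranslator and is not shown here. Stripped of the shard-template handling, its essential job is to persist the pair RDD through the new Hadoop OutputFormat API. A much-simplified sketch under that assumption (the signature below is illustrative; the real helper takes the ShardTemplateInformation and applies it to the output file names):

// Much-simplified sketch: persists the pairs via the new Hadoop OutputFormat API.
// The real writeHadoopFile also wires the shard template into the output path and format.
private static <K, V> void writeHadoopFile(JavaPairRDD<K, V> rdd, Configuration conf, String outputPath,
                                           Class<K> keyClass, Class<V> valueClass,
                                           Class<? extends OutputFormat<?, ?>> formatClass) {
  rdd.saveAsNewAPIHadoopFile(outputPath, keyClass, valueClass, formatClass, conf);
}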