
Example 1 with JavaRDDLike

Use of org.apache.spark.api.java.JavaRDDLike in project camel by apache.

From class SparkProducerTest, method createRegistry:

// Routes fixtures
@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = super.createRegistry();
    registry.bind("testFileRdd", sparkContext.textFile("src/test/resources/testrdd.txt"));
    if (shouldRunHive) {
        registry.bind("hiveContext", hiveContext);
        DataFrame jsonCars = hiveContext.read().json("src/test/resources/cars.json");
        jsonCars.registerTempTable("cars");
        registry.bind("jsonCars", jsonCars);
    }
    registry.bind("countLinesTransformation", new org.apache.camel.component.spark.RddCallback() {

        @Override
        public Object onRdd(JavaRDDLike rdd, Object... payloads) {
            return rdd.count();
        }
    });
    return registry;
}
Also used: JndiRegistry(org.apache.camel.impl.JndiRegistry) JavaRDDLike(org.apache.spark.api.java.JavaRDDLike) DataFrame(org.apache.spark.sql.DataFrame)
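
The beans bound above are consumed by referencing them from a Spark endpoint URI. A minimal sketch of that wiring, assuming the camel-spark "spark:rdd" endpoint with its rdd and rddCallback options (the registry hookup mirrors what the test base class does automatically):

import org.apache.camel.CamelContext;
import org.apache.camel.ProducerTemplate;
import org.apache.camel.impl.DefaultCamelContext;

// Hedged sketch: attach the JNDI registry from createRegistry() to a context
// and invoke the Spark producer. The "#name" syntax resolves beans bound above.
CamelContext camelContext = new DefaultCamelContext(createRegistry());
camelContext.start();
ProducerTemplate template = camelContext.createProducerTemplate();
// Count the lines of testrdd.txt via the bound RDD and callback.
Long lines = template.requestBody(
        "spark:rdd?rdd=#testFileRdd&rddCallback=#countLinesTransformation",
        null, Long.class);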

Example 2 with JavaRDDLike

Use of org.apache.spark.api.java.JavaRDDLike in project camel by apache.

From class RddSparkProducer, method process:

@Override
public void process(Exchange exchange) throws Exception {
    JavaRDDLike rdd = resolveRdd(exchange);
    RddCallback rddCallback = resolveRddCallback(exchange);
    Object body = exchange.getIn().getBody();
    // A List body is unpacked into varargs payloads; any other body is passed as a single payload.
    Object result = body instanceof List
            ? rddCallback.onRdd(rdd, ((List) body).toArray(new Object[0]))
            : rddCallback.onRdd(rdd, body);
    collectResults(exchange, result);
}
Also used: JavaRDDLike(org.apache.spark.api.java.JavaRDDLike) List(java.util.List)
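
Because a List body is unpacked into varargs, a callback can accept several payloads in one exchange. An illustrative sketch (the callback name and the factor payload are invented for this example):

import org.apache.camel.component.spark.RddCallback;
import org.apache.spark.api.java.JavaRDDLike;

// Illustrative callback: the first element of a List body arrives as payloads[0].
RddCallback multiplyLineCount = new RddCallback() {
    @Override
    public Object onRdd(JavaRDDLike rdd, Object... payloads) {
        int factor = (Integer) payloads[0];
        return rdd.count() * factor;
    }
};
// Sending Arrays.asList(10) as the exchange body would pass 10 as payloads[0].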

Example 3 with JavaRDDLike

Use of org.apache.spark.api.java.JavaRDDLike in project spark-dataflow by cloudera.

From class TransformTranslator, method multiDo:

private static <I, O> TransformEvaluator<ParDo.BoundMulti<I, O>> multiDo() {
    return new TransformEvaluator<ParDo.BoundMulti<I, O>>() {

        @Override
        public void evaluate(ParDo.BoundMulti<I, O> transform, EvaluationContext context) {
            TupleTag<O> mainOutputTag = MULTIDO_FG.get("mainOutputTag", transform);
            MultiDoFnFunction<I, O> multifn = new MultiDoFnFunction<>(transform.getFn(), context.getRuntimeContext(), mainOutputTag, getSideInputs(transform.getSideInputs(), context));
            @SuppressWarnings("unchecked") JavaRDDLike<WindowedValue<I>, ?> inRDD = (JavaRDDLike<WindowedValue<I>, ?>) context.getInputRDD(transform);
            JavaPairRDD<TupleTag<?>, WindowedValue<?>> all = inRDD.mapPartitionsToPair(multifn).cache();
            PCollectionTuple pct = context.getOutput(transform);
            for (Map.Entry<TupleTag<?>, PCollection<?>> e : pct.getAll().entrySet()) {
                @SuppressWarnings("unchecked") JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TupleTagFilter(e.getKey()));
                @SuppressWarnings("unchecked") JavaRDD<WindowedValue<Object>> // Object is the best we can do since different outputs can have different tags
                values = (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
                context.setRDD(e.getValue(), values);
            }
        }
    };
}
Also used: TupleTag(com.google.cloud.dataflow.sdk.values.TupleTag) TextIO(com.google.cloud.dataflow.sdk.io.TextIO) AvroIO(com.google.cloud.dataflow.sdk.io.AvroIO) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaRDDLike(org.apache.spark.api.java.JavaRDDLike) PCollection(com.google.cloud.dataflow.sdk.values.PCollection) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) ParDo(com.google.cloud.dataflow.sdk.transforms.ParDo) PCollectionTuple(com.google.cloud.dataflow.sdk.values.PCollectionTuple) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)
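
TupleTagFilter is referenced but not shown above. A minimal sketch of the shape it presumably has (the real class lives inside spark-dataflow's TransformTranslator; this version is an assumption for illustration): keep only the pairs whose tag matches the requested output.

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.values.TupleTag;

// Assumed implementation: a pair survives the filter iff its tag equals the
// output tag this filter was built for.
class TupleTagFilter implements Function<Tuple2<TupleTag<?>, WindowedValue<?>>, Boolean> {

    private final TupleTag<?> tag;

    TupleTagFilter(TupleTag<?> tag) {
        this.tag = tag;
    }

    @Override
    public Boolean call(Tuple2<TupleTag<?>, WindowedValue<?>> input) {
        return tag.equals(input._1());
    }
}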

Example 4 with JavaRDDLike

Use of org.apache.spark.api.java.JavaRDDLike in project camel by apache.

From class SparkProducerTest, method shouldExecuteVoidCallback:

@Test
public void shouldExecuteVoidCallback() throws IOException {
    // Given
    final File output = File.createTempFile("camel", "spark");
    // delete the temp file so the path is free for Spark to create an output directory
    output.delete();
    // When
    template.sendBodyAndHeader(sparkUri, null, SPARK_RDD_CALLBACK_HEADER, new VoidRddCallback() {

        @Override
        public void doOnRdd(JavaRDDLike rdd, Object... payloads) {
            rdd.saveAsTextFile(output.getAbsolutePath());
        }
    });
    // Then
    Truth.assertThat(output.length()).isGreaterThan(0L);
}
Also used: JavaRDDLike(org.apache.spark.api.java.JavaRDDLike) File(java.io.File) Test(org.junit.Test)
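
VoidRddCallback is the fire-and-forget variant of RddCallback: doOnRdd returns nothing, so the producer puts no result into the exchange. An illustrative variation (the payload-based directory argument is invented here) that avoids closing over a local File, so the callback can be reused across exchanges:

import org.apache.camel.component.spark.VoidRddCallback;
import org.apache.spark.api.java.JavaRDDLike;

// Illustrative sketch: take the output directory from the payloads instead of
// capturing it from the enclosing scope.
VoidRddCallback saveToDirectory = new VoidRddCallback() {
    @Override
    public void doOnRdd(JavaRDDLike rdd, Object... payloads) {
        rdd.saveAsTextFile((String) payloads[0]);
    }
};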

Example 5 with JavaRDDLike

Use of org.apache.spark.api.java.JavaRDDLike in project spark-dataflow by cloudera.

From class TransformTranslator, method combineGlobally:

private static <I, A, O> TransformEvaluator<Combine.Globally<I, O>> combineGlobally() {
    return new TransformEvaluator<Combine.Globally<I, O>>() {

        @Override
        public void evaluate(Combine.Globally<I, O> transform, EvaluationContext context) {
            final Combine.CombineFn<I, A, O> globally = COMBINE_GLOBALLY_FG.get("fn", transform);
            @SuppressWarnings("unchecked") JavaRDDLike<WindowedValue<I>, ?> inRdd = (JavaRDDLike<WindowedValue<I>, ?>) context.getInputRDD(transform);
            final Coder<I> iCoder = context.getInput(transform).getCoder();
            final Coder<A> aCoder;
            try {
                aCoder = globally.getAccumulatorCoder(context.getPipeline().getCoderRegistry(), iCoder);
            } catch (CannotProvideCoderException e) {
                throw new IllegalStateException("Could not determine coder for accumulator", e);
            }
            // Use coders to convert objects in the PCollection to byte arrays, so they
            // can be transferred over the network for the shuffle.
            JavaRDD<byte[]> inRddBytes = inRdd.map(WindowingHelpers.<I>unwindowFunction()).map(CoderHelpers.toByteFunction(iCoder));
            /*A*/ byte[] acc = inRddBytes.aggregate(CoderHelpers.toByteArray(globally.createAccumulator(), aCoder), new Function2<byte[], byte[], byte[]>() {

                @Override
                public byte[] call(/*A*/ byte[] ab, /*I*/ byte[] ib) throws Exception {
                    A a = CoderHelpers.fromByteArray(ab, aCoder);
                    I i = CoderHelpers.fromByteArray(ib, iCoder);
                    return CoderHelpers.toByteArray(globally.addInput(a, i), aCoder);
                }
            }, new Function2<byte[], byte[], byte[]>() {

                @Override
                public byte[] call(/*A*/ byte[] a1b, /*A*/ byte[] a2b) throws Exception {
                    A a1 = CoderHelpers.fromByteArray(a1b, aCoder);
                    A a2 = CoderHelpers.fromByteArray(a2b, aCoder);
                    // don't use Guava's ImmutableList.of as values may be null
                    List<A> accumulators = Collections.unmodifiableList(Arrays.asList(a1, a2));
                    A merged = globally.mergeAccumulators(accumulators);
                    return CoderHelpers.toByteArray(merged, aCoder);
                }
            });
            O output = globally.extractOutput(CoderHelpers.fromByteArray(acc, aCoder));
            Coder<O> coder = context.getOutput(transform).getCoder();
            JavaRDD<byte[]> outRdd = context.getSparkContext().parallelize(
                    // don't use Guava's ImmutableList.of as output may be null
                    CoderHelpers.toByteArrays(Collections.singleton(output), coder));
            context.setOutputRDD(transform, outRdd.map(CoderHelpers.fromByteFunction(coder)).map(WindowingHelpers.<O>windowFunction()));
        }
    };
}
Also used: Combine(com.google.cloud.dataflow.sdk.transforms.Combine) CannotProvideCoderException(com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException) IOException(java.io.IOException) TextIO(com.google.cloud.dataflow.sdk.io.TextIO) AvroIO(com.google.cloud.dataflow.sdk.io.AvroIO) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) JavaRDDLike(org.apache.spark.api.java.JavaRDDLike) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) List(java.util.List) PCollectionList(com.google.cloud.dataflow.sdk.values.PCollectionList)
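
The coder round-trips above can obscure the underlying aggregate() contract. A stripped-down sketch of the same pattern with plain numbers (the local master and app name are illustrative): the first Function2 (seqOp) folds one input into the accumulator, the second (combOp) merges two partial accumulators, mirroring addInput and mergeAccumulators.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

JavaSparkContext sc = new JavaSparkContext("local[2]", "aggregate-sketch");
JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 2, 3, 4));
long total = nums.aggregate(0L,
        new Function2<Long, Integer, Long>() {
            @Override
            public Long call(Long acc, Integer x) {
                // seqOp: fold one element into the running accumulator
                return acc + x;
            }
        },
        new Function2<Long, Long, Long>() {
            @Override
            public Long call(Long a1, Long a2) {
                // combOp: merge two partition-level accumulators
                return a1 + a2;
            }
        });
// total == 10
sc.stop();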

Aggregations

JavaRDDLike (org.apache.spark.api.java.JavaRDDLike): 5 usages
HadoopIO (com.cloudera.dataflow.hadoop.HadoopIO): 2 usages
AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO): 2 usages
TextIO (com.google.cloud.dataflow.sdk.io.TextIO): 2 usages
WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue): 2 usages
List (java.util.List): 2 usages
CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException): 1 usage
Combine (com.google.cloud.dataflow.sdk.transforms.Combine): 1 usage
ParDo (com.google.cloud.dataflow.sdk.transforms.ParDo): 1 usage
PCollection (com.google.cloud.dataflow.sdk.values.PCollection): 1 usage
PCollectionList (com.google.cloud.dataflow.sdk.values.PCollectionList): 1 usage
PCollectionTuple (com.google.cloud.dataflow.sdk.values.PCollectionTuple): 1 usage
TupleTag (com.google.cloud.dataflow.sdk.values.TupleTag): 1 usage
ImmutableMap (com.google.common.collect.ImmutableMap): 1 usage
File (java.io.File): 1 usage
IOException (java.io.IOException): 1 usage
Map (java.util.Map): 1 usage
JndiRegistry (org.apache.camel.impl.JndiRegistry): 1 usage
JavaRDD (org.apache.spark.api.java.JavaRDD): 1 usage
DataFrame (org.apache.spark.sql.DataFrame): 1 usage