Search in sources :

Example 41 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testZipFunction1.

/**
 * Zips the per-sentence vocab-word lists with the output of {@link CountCumSum#buildCumSum()}
 * and checks the first two resulting pairs, using the tokenizer settings of {@code word2vec}.
 *
 * <p>Expected: the first pair holds "strange", "strange", "world" with value 6; the second
 * holds "flowers", "red" with value 9.
 * NOTE(review): the original comment described this as the stop-word case — presumably
 * {@code word2vec} is configured to drop stop words; confirm against the fixture setup.
 *
 * @throws Exception if the pipeline or the Spark job fails
 */
@Test
public void testZipFunction1() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        pipeline.buildVocabWordListRDD();
        JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
        JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
        CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
        JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
        JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
        List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

        // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
        List<VocabWord> vocabWordsList1 = lst.get(0)._1();
        Long cumSumSize1 = lst.get(0)._2();
        assertEquals(3, vocabWordsList1.size());
        assertEquals("strange", vocabWordsList1.get(0).getWord());
        assertEquals("strange", vocabWordsList1.get(1).getWord());
        assertEquals("world", vocabWordsList1.get(2).getWord());
        // Compare as longs rather than through a floating-point delta overload.
        assertEquals(6L, cumSumSize1.longValue());

        List<VocabWord> vocabWordsList2 = lst.get(1)._1();
        Long cumSumSize2 = lst.get(1)._2();
        assertEquals(2, vocabWordsList2.size());
        assertEquals("flowers", vocabWordsList2.get(0).getWord());
        assertEquals("red", vocabWordsList2.get(1).getWord());
        assertEquals(9L, cumSumSize2.longValue());
    } finally {
        // Stop the context even when an assertion fails so it does not leak into later tests.
        sc.stop();
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) AtomicLong(java.util.concurrent.atomic.AtomicLong) Tuple2(scala.Tuple2) AtomicLong(java.util.concurrent.atomic.AtomicLong) CountCumSum(org.deeplearning4j.spark.text.functions.CountCumSum) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Test(org.junit.Test)

Example 42 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class TextPipelineTest method testZipFunction2.

/**
 * Zips the per-sentence vocab-word lists with the output of {@link CountCumSum#buildCumSum()}
 * and checks the first two resulting pairs, using the tokenizer settings of
 * {@code word2vecNoStop}.
 *
 * <p>Expected: the first pair holds "this", "is", "a", "strange", "strange", "world" with
 * value 6; the second holds "flowers", "are", "red" with value 9.
 * NOTE(review): presumably {@code word2vecNoStop} keeps stop words in the output — confirm
 * against the fixture setup.
 *
 * @throws Exception if the pipeline or the Spark job fails
 */
@Test
public void testZipFunction2() throws Exception {
    JavaSparkContext sc = getContext();
    try {
        JavaRDD<String> corpusRDD = getCorpusRDD(sc);
        Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());
        TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
        pipeline.buildVocabCache();
        pipeline.buildVocabWordListRDD();
        JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
        JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
        CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
        JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
        JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
        List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

        // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
        List<VocabWord> vocabWordsList1 = lst.get(0)._1();
        Long cumSumSize1 = lst.get(0)._2();
        assertEquals(6, vocabWordsList1.size());
        assertEquals("this", vocabWordsList1.get(0).getWord());
        assertEquals("is", vocabWordsList1.get(1).getWord());
        assertEquals("a", vocabWordsList1.get(2).getWord());
        assertEquals("strange", vocabWordsList1.get(3).getWord());
        assertEquals("strange", vocabWordsList1.get(4).getWord());
        assertEquals("world", vocabWordsList1.get(5).getWord());
        // Compare as longs rather than through a floating-point delta overload.
        assertEquals(6L, cumSumSize1.longValue());

        List<VocabWord> vocabWordsList2 = lst.get(1)._1();
        Long cumSumSize2 = lst.get(1)._2();
        assertEquals(3, vocabWordsList2.size());
        assertEquals("flowers", vocabWordsList2.get(0).getWord());
        assertEquals("are", vocabWordsList2.get(1).getWord());
        assertEquals("red", vocabWordsList2.get(2).getWord());
        assertEquals(9L, cumSumSize2.longValue());
    } finally {
        // Stop the context even when an assertion fails so it does not leak into later tests.
        sc.stop();
    }
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) TextPipeline(org.deeplearning4j.spark.text.functions.TextPipeline) AtomicLong(java.util.concurrent.atomic.AtomicLong) Tuple2(scala.Tuple2) AtomicLong(java.util.concurrent.atomic.AtomicLong) CountCumSum(org.deeplearning4j.spark.text.functions.CountCumSum) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Test(org.junit.Test)

Example 43 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class TestSparkMultiLayerParameterAveraging method testDistributedScoring.

/**
 * Checks that per-example scores returned by {@code SparkDl4jMultiLayer.scoreExamples} agree
 * (within 1e-5) with scores computed locally on a clone of the same network, both with and
 * without regularization terms, for keyed and un-keyed input RDDs.
 */
@Test
public void testDistributedScoring() {
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .regularization(true)
            .l1(0.1)
            .l2(0.1)
            .seed(123)
            .updater(Updater.NESTEROVS)
            .learningRate(0.1)
            .momentum(0.9)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(nIn)
                    .nOut(3)
                    .activation(Activation.TANH)
                    .build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(3)
                    .nOut(nOut)
                    .activation(Activation.SOFTMAX)
                    .build())
            .backprop(true)
            .pretrain(false)
            .build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
            new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0));
    MultiLayerNetwork localNet = sparkNet.getNetwork().clone();

    // Random features plus one-hot labels drawn from a fixed-seed generator.
    int exampleCount = 100;
    INDArray features = Nd4j.rand(exampleCount, nIn);
    INDArray labels = Nd4j.zeros(exampleCount, nOut);
    Random rng = new Random(12345);
    for (int row = 0; row < exampleCount; row++) {
        int hotColumn = rng.nextInt(nOut);
        labels.putScalar(new int[] { row, hotColumn }, 1.0);
    }

    // Reference scores from the local copy of the network.
    INDArray expectedWithReg = localNet.scoreExamples(new DataSet(features, labels), true);
    INDArray expectedNoReg = localNet.scoreExamples(new DataSet(features, labels), false);

    // Keyed variant: one single-row DataSet per example, keyed by its row index.
    List<Tuple2<String, DataSet>> keyedExamples = new ArrayList<>();
    for (int row = 0; row < exampleCount; row++) {
        DataSet single = new DataSet(features.getRow(row).dup(), labels.getRow(row).dup());
        String key = String.valueOf(row);
        keyedExamples.add(new Tuple2<>(key, single));
    }
    JavaPairRDD<String, DataSet> keyedRdd = sc.parallelizePairs(keyedExamples);
    JavaPairRDD<String, Double> keyedWithReg = sparkNet.scoreExamples(keyedRdd, true, 4);
    JavaPairRDD<String, Double> keyedNoReg = sparkNet.scoreExamples(keyedRdd, false, 4);
    Map<String, Double> keyedWithRegByKey = keyedWithReg.collectAsMap();
    Map<String, Double> keyedNoRegByKey = keyedNoReg.collectAsMap();
    for (int row = 0; row < exampleCount; row++) {
        String key = String.valueOf(row);

        double wantWithReg = expectedWithReg.getDouble(row);
        double gotWithReg = keyedWithRegByKey.get(key);
        assertEquals(wantWithReg, gotWithReg, 1e-5);

        double wantNoReg = expectedNoReg.getDouble(row);
        double gotNoReg = keyedNoRegByKey.get(key);
        assertEquals(wantNoReg, gotNoReg, 1e-5);
    }

    // Un-keyed variant: results carry no key, so compare after sorting both sides.
    List<DataSet> plainExamples = new ArrayList<>();
    for (int row = 0; row < exampleCount; row++) {
        plainExamples.add(new DataSet(features.getRow(row).dup(), labels.getRow(row).dup()));
    }
    JavaRDD<DataSet> plainRdd = sc.parallelize(plainExamples);
    List<Double> sparkWithReg = new ArrayList<>(sparkNet.scoreExamples(plainRdd, true, 4).collect());
    List<Double> sparkNoReg = new ArrayList<>(sparkNet.scoreExamples(plainRdd, false, 4).collect());
    Collections.sort(sparkWithReg);
    Collections.sort(sparkNoReg);

    double[] sortedExpectedWithReg = expectedWithReg.data().asDouble();
    double[] sortedExpectedNoReg = expectedNoReg.data().asDouble();
    Arrays.sort(sortedExpectedWithReg);
    Arrays.sort(sortedExpectedNoReg);
    for (int idx = 0; idx < sortedExpectedWithReg.length; idx++) {
        assertEquals(sortedExpectedWithReg[idx], sparkWithReg.get(idx), 1e-5);
        assertEquals(sortedExpectedNoReg[idx], sparkNoReg.get(idx), 1e-5);
    }
}
Also used : MultiDataSet(org.nd4j.linalg.dataset.MultiDataSet) DataSet(org.nd4j.linalg.dataset.DataSet) MultiLayerConfiguration(org.deeplearning4j.nn.conf.MultiLayerConfiguration) SparkDl4jMultiLayer(org.deeplearning4j.spark.impl.multilayer.SparkDl4jMultiLayer) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork) NeuralNetConfiguration(org.deeplearning4j.nn.conf.NeuralNetConfiguration) LabeledPoint(org.apache.spark.mllib.regression.LabeledPoint) DenseLayer(org.deeplearning4j.nn.conf.layers.DenseLayer) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Tuple2(scala.Tuple2) BaseSparkTest(org.deeplearning4j.spark.BaseSparkTest) Test(org.junit.Test)

Example 44 with Tuple2

use of scala.Tuple2 in project tdi-studio-se by Talend.

the class KeyByCompareColFunction method call.

/**
 * Splits a row into a (key, values) pair based on the configured compare columns.
 *
 * <p>The key holds the row's elements at each {@code compCols} entry's column id, in
 * {@code compCols} order. The values hold every remaining element of the row, in row order.
 *
 * @param d the input row
 * @return a tuple whose first element is the key list and whose second is the values list
 */
public Tuple2<List<Object>, List<Object>> call(List<Object> d) {
    List<Object> key = new ArrayList<Object>(compCols.size());
    for (CompareCol compCol : compCols) {
        key.add(d.get(compCol.getColId()));
    }

    List<Object> values = new ArrayList<Object>(d.size());
    // Primitive counter: avoids unboxing and re-boxing the loop index on every iteration.
    for (int col = 0; col < d.size(); col++) {
        // Box once per column; the column ids are compared through equals().
        Integer colId = col;
        boolean isKeyColumn = false;
        for (CompareCol compCol : compCols) {
            if (compCol.getColId().equals(colId)) {
                isKeyColumn = true;
                break;
            }
        }
        if (!isKeyColumn) {
            values.add(d.get(col));
        }
    }
    return new Tuple2<List<Object>, List<Object>>(key, values);
}
Also used : CompareCol(org.talend.spark.utils.CompareCol) Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList)

Example 45 with Tuple2

use of scala.Tuple2 in project tdi-studio-se by Talend.

the class HBaseStore method run.

/**
 * Writes each row of {@code rdd} to an HBase table through {@link TableOutputFormat}.
 *
 * <p>Each row becomes one {@link Put}. The row key is the MD5 digest of the concatenated
 * string forms of the fields at the {@code keyList} indexes; when that concatenation is
 * empty, the digest of {@code t.toString()} for the whole row is used instead. Field
 * {@code i} of a row is written under the {@code i}-th entry of {@code columns}; fields
 * beyond the number of column entries are ignored.
 *
 * @param zookeeperHost value for {@code hbase.zookeeper.quorum}
 * @param zookeeperPort value for {@code hbase.zookeeper.property.clientPort}
 * @param table name of the output table
 * @param columns space-separated column specs, each split on {@code ":"} into the two
 *        leading arguments of {@code Put.add} (column family and qualifier in the HBase API)
 * @param properties extra configuration entries, applied after the settings above
 * @param rdd rows to store
 * @param keyList indexes of the row fields that make up the row key
 * @throws IOException declared by the signature; NOTE(review): presumably raised while
 *         configuring or writing the job — confirm against callers
 */
public static void run(String zookeeperHost, String zookeeperPort, String table, final String columns, Map<String, String> properties, TalendRDD<List<Object>> rdd, final List<Integer> keyList) throws IOException {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", zookeeperHost);
    conf.set("hbase.zookeeper.property.clientPort", zookeeperPort);
    conf.set("hbase.mapred.tablecolumns", columns);
    for (Entry<String, String> e : properties.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }
    // The column list is the same for every record, so split it once here instead of per record.
    final String[] cols = columns.split(" ");
    TalendPairRDD<ImmutableBytesWritable, Put> hbaseRdd = rdd.mapToPair(new PairFunction<List<Object>, ImmutableBytesWritable, Put>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<ImmutableBytesWritable, Put> call(List<Object> t) throws Exception {
            StringBuilder keyBuilder = new StringBuilder();
            for (int i : keyList) {
                keyBuilder.append(t.get(i));
            }
            String key = keyBuilder.toString();
            // Fall back to the whole row's string form when no key fields contribute anything.
            Put put = new Put(DigestUtils.md5(key.isEmpty() ? t.toString() : key));
            int i = 0;
            for (Object o : t) {
                if (i >= cols.length) {
                    // No column spec left for the remaining fields.
                    break;
                }
                // Split each spec once instead of once per half.
                String[] spec = cols[i].split(":");
                put.add(org.apache.hadoop.hbase.util.Bytes.toBytes(spec[0]), org.apache.hadoop.hbase.util.Bytes.toBytes(spec[1]), (o != null ? org.apache.hadoop.hbase.util.Bytes.toBytes(o.toString()) : null));
                i++;
            }
            return new Tuple2<ImmutableBytesWritable, Put>(new ImmutableBytesWritable(), put);
        }
    });
    JobConf config = new JobConf(conf);
    config.set(TableOutputFormat.OUTPUT_TABLE, table);
    config.setOutputFormat(TableOutputFormat.class);
    hbaseRdd.saveAsHadoopDataset(config);
}
Also used : ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration) Configuration(org.apache.hadoop.conf.Configuration) Put(org.apache.hadoop.hbase.client.Put) Put(org.apache.hadoop.hbase.client.Put) IOException(java.io.IOException) Tuple2(scala.Tuple2) List(java.util.List) JobConf(org.apache.hadoop.mapred.JobConf)

Aggregations

Tuple2 (scala.Tuple2)183 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)57 ArrayList (java.util.ArrayList)44 IOException (java.io.IOException)32 Test (org.junit.Test)32 INDArray (org.nd4j.linalg.api.ndarray.INDArray)28 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)23 List (java.util.List)22 Function (org.apache.spark.api.java.function.Function)19 File (java.io.File)18 Collectors (java.util.stream.Collectors)18 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)18 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)18 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)18 Configuration (org.apache.hadoop.conf.Configuration)17 UserException (org.broadinstitute.hellbender.exceptions.UserException)17 Broadcast (org.apache.spark.broadcast.Broadcast)16 SparkConf (org.apache.spark.SparkConf)15 JavaRDD (org.apache.spark.api.java.JavaRDD)15 VisibleForTesting (com.google.common.annotations.VisibleForTesting)14