Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
In the class TextPipelineTest, method testZipFunction1.
/**
 * Verifies the vocab word lists and cumulative sentence counts produced when stop words are removed.
 *
 * @throws Exception
 */
@Test
public void testZipFunction1() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(3, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "strange");
    assertEquals(vocabWordsList1.get(1).getWord(), "strange");
    assertEquals(vocabWordsList1.get(2).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(2, vocabWordsList2.size());
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
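The core Tuple2 usage here is JavaRDD.zip, which pairs the i-th element of one RDD with the i-th element of another. The following stand-alone sketch shows just that pattern with the plain Spark Java API; the class name, sentence strings, and counts are illustrative (chosen to mirror the values asserted above), not part of the DL4J pipeline.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class ZipTuple2Sketch {

    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setMaster("local[*]").setAppName("zipSketch"));
        // Illustrative data: one entry per sentence, mirroring the cumulative counts asserted in the test.
        // A single partition is used so both RDDs are guaranteed to line up for zip.
        JavaRDD<String> sentences = sc.parallelize(Arrays.asList("strange strange world", "flowers are red"), 1);
        JavaRDD<Long> cumulativeCounts = sc.parallelize(Arrays.asList(6L, 9L), 1);
        // zip pairs elements positionally; both RDDs must have the same number of
        // partitions and the same number of elements per partition.
        JavaPairRDD<String, Long> zipped = sentences.zip(cumulativeCounts);
        for (Tuple2<String, Long> pair : zipped.collect()) {
            System.out.println(pair._1() + " -> " + pair._2());
        }
        sc.stop();
    }
}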
Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
In the class TextPipelineTest, method testZipFunction2.
@Test
public void testZipFunction2() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());
    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();
    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();
    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();
    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD = vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(6, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "this");
    assertEquals(vocabWordsList1.get(1).getWord(), "is");
    assertEquals(vocabWordsList1.get(2).getWord(), "a");
    assertEquals(vocabWordsList1.get(3).getWord(), "strange");
    assertEquals(vocabWordsList1.get(4).getWord(), "strange");
    assertEquals(vocabWordsList1.get(5).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(vocabWordsList2.size(), 3);
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "are");
    assertEquals(vocabWordsList2.get(2).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
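The only difference between the two tests is the tokenizer configuration: testZipFunction1 uses word2vec (stop words removed), testZipFunction2 uses word2vecNoStop (stop words kept). A hypothetical plain-Java illustration of that difference is below; the stop-word set is inferred from the two expected word lists, not taken from DL4J's stop-word list.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class StopWordFilterSketch {

    public static void main(String[] args) {
        List<String> tokens = Arrays.asList("this", "is", "a", "strange", "strange", "world");
        // Assumed stop words, inferred from the difference between the two tests' expected outputs.
        Set<String> stopWords = new HashSet<>(Arrays.asList("this", "is", "a"));
        List<String> filtered = new ArrayList<>();
        for (String token : tokens) {
            if (!stopWords.contains(token)) {
                filtered.add(token);
            }
        }
        System.out.println(filtered); // [strange, strange, world], as asserted in testZipFunction1
        System.out.println(tokens);   // the full list, as asserted in testZipFunction2
    }
}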
Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
In the class TestSparkMultiLayerParameterAveraging, method testDistributedScoring.
@Test
public void testDistributedScoring() {
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().regularization(true).l1(0.1).l2(0.1)
                    .seed(123).updater(Updater.NESTEROVS).learningRate(0.1).momentum(0.9).list()
                    .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(nIn).nOut(3)
                                    .activation(Activation.TANH).build())
                    .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                                    .nIn(3).nOut(nOut).activation(Activation.SOFTMAX).build())
                    .backprop(true).pretrain(false).build();
    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
                    new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 10, 1, 0));
    MultiLayerNetwork netCopy = sparkNet.getNetwork().clone();

    int nRows = 100;
    INDArray features = Nd4j.rand(nRows, nIn);
    INDArray labels = Nd4j.zeros(nRows, nOut);
    Random r = new Random(12345);
    for (int i = 0; i < nRows; i++) {
        labels.putScalar(new int[] { i, r.nextInt(nOut) }, 1.0);
    }

    INDArray localScoresWithReg = netCopy.scoreExamples(new DataSet(features, labels), true);
    INDArray localScoresNoReg = netCopy.scoreExamples(new DataSet(features, labels), false);

    List<Tuple2<String, DataSet>> dataWithKeys = new ArrayList<>();
    for (int i = 0; i < nRows; i++) {
        DataSet ds = new DataSet(features.getRow(i).dup(), labels.getRow(i).dup());
        dataWithKeys.add(new Tuple2<>(String.valueOf(i), ds));
    }
    JavaPairRDD<String, DataSet> dataWithKeysRdd = sc.parallelizePairs(dataWithKeys);

    JavaPairRDD<String, Double> sparkScoresWithReg = sparkNet.scoreExamples(dataWithKeysRdd, true, 4);
    JavaPairRDD<String, Double> sparkScoresNoReg = sparkNet.scoreExamples(dataWithKeysRdd, false, 4);
    Map<String, Double> sparkScoresWithRegMap = sparkScoresWithReg.collectAsMap();
    Map<String, Double> sparkScoresNoRegMap = sparkScoresNoReg.collectAsMap();

    for (int i = 0; i < nRows; i++) {
        double scoreRegExp = localScoresWithReg.getDouble(i);
        double scoreRegAct = sparkScoresWithRegMap.get(String.valueOf(i));
        assertEquals(scoreRegExp, scoreRegAct, 1e-5);
        double scoreNoRegExp = localScoresNoReg.getDouble(i);
        double scoreNoRegAct = sparkScoresNoRegMap.get(String.valueOf(i));
        assertEquals(scoreNoRegExp, scoreNoRegAct, 1e-5);
        // System.out.println(scoreRegExp + "\t" + scoreRegAct + "\t" + scoreNoRegExp + "\t" + scoreNoRegAct);
    }

    List<DataSet> dataNoKeys = new ArrayList<>();
    for (int i = 0; i < nRows; i++) {
        dataNoKeys.add(new DataSet(features.getRow(i).dup(), labels.getRow(i).dup()));
    }
    JavaRDD<DataSet> dataNoKeysRdd = sc.parallelize(dataNoKeys);

    List<Double> scoresWithReg = new ArrayList<>(sparkNet.scoreExamples(dataNoKeysRdd, true, 4).collect());
    List<Double> scoresNoReg = new ArrayList<>(sparkNet.scoreExamples(dataNoKeysRdd, false, 4).collect());
    Collections.sort(scoresWithReg);
    Collections.sort(scoresNoReg);

    double[] localScoresWithRegDouble = localScoresWithReg.data().asDouble();
    double[] localScoresNoRegDouble = localScoresNoReg.data().asDouble();
    Arrays.sort(localScoresWithRegDouble);
    Arrays.sort(localScoresNoRegDouble);

    for (int i = 0; i < localScoresWithRegDouble.length; i++) {
        assertEquals(localScoresWithRegDouble[i], scoresWithReg.get(i), 1e-5);
        assertEquals(localScoresNoRegDouble[i], scoresNoReg.get(i), 1e-5);
        //System.out.println(localScoresWithRegDouble[i] + "\t" + scoresWithReg.get(i) + "\t" + localScoresNoRegDouble[i] + "\t" + scoresNoReg.get(i));
    }
}
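The keyed half of this test relies on a common Spark pattern: build Tuple2 pairs locally, ship them with JavaSparkContext.parallelizePairs, and read results back with collectAsMap. A minimal sketch of just that pattern follows; the class name, keys, and score values are made up, and no DL4J network is involved.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class KeyedScoresSketch {

    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(new SparkConf().setMaster("local[*]").setAppName("keyedSketch"));
        // Build (key, value) pairs locally, as the test does for (row index, DataSet).
        List<Tuple2<String, Double>> keyedScores = new ArrayList<>();
        for (int i = 0; i < 5; i++) {
            keyedScores.add(new Tuple2<>(String.valueOf(i), i * 0.5));
        }
        JavaPairRDD<String, Double> keyedRdd = sc.parallelizePairs(keyedScores);
        // collectAsMap returns the pairs as a Map keyed by each Tuple2's first element.
        Map<String, Double> byKey = keyedRdd.collectAsMap();
        System.out.println(byKey.get("2")); // 1.0
        sc.stop();
    }
}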
Use of scala.Tuple2 in project tdi-studio-se by Talend.
In the class KeyByCompareColFunction, method call.
public Tuple2<List<Object>, List<Object>> call(List<Object> d) {
    List<Object> key = new ArrayList<Object>(compCols.size());
    for (int i = 0; i < compCols.size(); i++) {
        key.add(i, d.get(compCols.get(i).getColId()));
    }
    List<Object> values = new ArrayList<Object>(d.size());
    int valId = 0;
    for (Integer i = 0; i < d.size(); i++) {
        boolean contain = false;
        for (CompareCol compCol : compCols) {
            if (compCol.getColId().equals(i)) {
                contain = true;
                break;
            }
        }
        if (!contain) {
            values.add(valId, d.get(i));
            valId++;
        }
    }
    return new Tuple2<List<Object>, List<Object>>(key, values);
}
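Concretely, call splits each row into a Tuple2 of (compare-column values, remaining values). The hypothetical stand-alone sketch below reproduces the same split with plain column indices in place of Talend's CompareCol objects; all names and data are illustrative.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import scala.Tuple2;

public class KeySplitSketch {

    public static void main(String[] args) {
        List<Object> row = Arrays.asList("a", "b", "c", "d");
        List<Integer> keyColIds = Arrays.asList(0, 2); // stands in for the compCols column IDs
        List<Object> key = new ArrayList<>();
        List<Object> values = new ArrayList<>();
        for (int i = 0; i < row.size(); i++) {
            if (keyColIds.contains(i)) {
                key.add(row.get(i));   // columns used for comparison become the key
            } else {
                values.add(row.get(i)); // everything else becomes the value
            }
        }
        Tuple2<List<Object>, List<Object>> pair = new Tuple2<>(key, values);
        System.out.println(pair._1() + " / " + pair._2()); // [a, c] / [b, d]
    }
}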
Use of scala.Tuple2 in project tdi-studio-se by Talend.
In the class HBaseStore, method run.
public static void run(String zookeeperHost, String zookeeperPort, String table, final String columns, Map<String, String> properties, TalendRDD<List<Object>> rdd, final List<Integer> keyList) throws IOException {
    Configuration conf = HBaseConfiguration.create();
    conf.set("hbase.zookeeper.quorum", zookeeperHost);
    conf.set("hbase.zookeeper.property.clientPort", zookeeperPort);
    conf.set("hbase.mapred.tablecolumns", columns);
    for (Entry<String, String> e : properties.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    TalendPairRDD<ImmutableBytesWritable, Put> hbaseRdd = rdd.mapToPair(new PairFunction<List<Object>, ImmutableBytesWritable, Put>() {

        private static final long serialVersionUID = 1L;

        public Tuple2<ImmutableBytesWritable, Put> call(List<Object> t) throws Exception {
            String key = "";
            for (int i : keyList) {
                key = key + t.get(i);
            }
            org.apache.hadoop.hbase.client.Put put = new org.apache.hadoop.hbase.client.Put(DigestUtils.md5("".equals(key) ? t.toString() : key));
            String[] cols = columns.split(" ");
            int i = 0;
            for (Object o : t) {
                if (cols.length > i) {
                    put.add(org.apache.hadoop.hbase.util.Bytes.toBytes(cols[i].split(":")[0]),
                            org.apache.hadoop.hbase.util.Bytes.toBytes(cols[i].split(":")[1]),
                            (o != null ? org.apache.hadoop.hbase.util.Bytes.toBytes(o.toString()) : null));
                }
                i++;
            }
            return new Tuple2<ImmutableBytesWritable, Put>(new ImmutableBytesWritable(), put);
        }
    });

    JobConf config = new JobConf(conf);
    config.set(TableOutputFormat.OUTPUT_TABLE, table);
    config.setOutputFormat(TableOutputFormat.class);
    hbaseRdd.saveAsHadoopDataset(config);
}
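The space-separated columns string ("family:qualifier family:qualifier ...") determines where each row value lands in the Put built inside the PairFunction. The sketch below isolates just that mapping with an invented column descriptor and row, and uses the same older Put.add(byte[], byte[], byte[]) signature as the code above; it does not require a running HBase cluster.

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class HBasePutSketch {

    public static void main(String[] args) {
        // Illustrative inputs; in HBaseStore.run these come from the job configuration and the RDD rows.
        String columns = "f:name f:color";
        List<Object> row = Arrays.asList("rose", "red");
        Put put = new Put(Bytes.toBytes("row-1"));
        String[] cols = columns.split(" ");
        for (int i = 0; i < row.size() && i < cols.length; i++) {
            String[] familyAndQualifier = cols[i].split(":");
            Object value = row.get(i);
            // Each row value is written to the family/qualifier taken from the columns string.
            put.add(Bytes.toBytes(familyAndQualifier[0]), Bytes.toBytes(familyAndQualifier[1]),
                    value != null ? Bytes.toBytes(value.toString()) : null);
        }
        System.out.println(put);
    }
}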