Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
From the class SparkComputationGraph, method calculateScoreMultiDataSet.
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<MultiDataSet>}, either by summing
 * or averaging over the entire data set.
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole
 *                      partition in one go)
 */
public double calculateScoreMultiDataSet(JavaRDD<MultiDataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Integer, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGMultiDataSet(conf.toJson(),
                    sc.broadcast(network.params(false)), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Integer, Double> countAndSumScores = rdd.reduce(new IntDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
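The reduce step above uses IntDoubleReduceFunction, whose body is not shown in this listing. A minimal sketch of what such a reducer could look like, assuming it simply sums the example counts and the score sums pairwise (the implementation below is an illustration, not the library's verbatim code):

import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

// Sketch only: combine (exampleCount, scoreSum) pairs by summing both components,
// so the final tuple holds the total example count and the total weighted score.
public class IntDoubleReduceFunction
        implements Function2<Tuple2<Integer, Double>, Tuple2<Integer, Double>, Tuple2<Integer, Double>> {

    @Override
    public Tuple2<Integer, Double> call(Tuple2<Integer, Double> f, Tuple2<Integer, Double> s) throws Exception {
        return new Tuple2<>(f._1() + s._1(), f._2() + s._2());
    }
}

Any commutative and associative combiner with this shape would work with RDD.reduce; summing both fields is what makes the final division by countAndSumScores._1() an example-weighted average.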
Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
From the class GraphFeedForwardWithKeyFunctionAdapter, method call.
@Override
public Iterable<Tuple2<K, INDArray[]>> call(Iterator<Tuple2<K, INDArray[]>> iterator) throws Exception {
    if (!iterator.hasNext()) {
        return Collections.emptyList();
    }
    ComputationGraph network = new ComputationGraph(ComputationGraphConfiguration.fromJson(jsonConfig.getValue()));
    network.init();
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParams(val);

    //Issue: for 2d data (MLPs etc) we can just stack the examples.
    //But: for 3d and 4d: in principle the data sizes could be different
    //We could handle that with mask arrays - but it gets messy. The approach used here is simpler but less efficient

    List<INDArray[]> featuresList = new ArrayList<>(batchSize);
    List<K> keyList = new ArrayList<>(batchSize);
    List<Integer> origSizeList = new ArrayList<>();

    int[][] firstShapes = null;
    boolean sizesDiffer = false;
    int tupleCount = 0;
    while (iterator.hasNext()) {
        Tuple2<K, INDArray[]> t2 = iterator.next();
        if (firstShapes == null) {
            firstShapes = new int[t2._2().length][0];
            for (int i = 0; i < firstShapes.length; i++) {
                firstShapes[i] = t2._2()[i].shape();
            }
        } else if (!sizesDiffer) {
            for (int i = 0; i < firstShapes.length; i++) {
                for (int j = 1; j < firstShapes[i].length; j++) {
                    if (firstShapes[i][j] != featuresList.get(tupleCount - 1)[i].size(j)) {
                        sizesDiffer = true;
                        break;
                    }
                }
            }
        }
        featuresList.add(t2._2());
        keyList.add(t2._1());
        origSizeList.add(t2._2()[0].size(0));
        tupleCount++;
    }

    if (tupleCount == 0) {
        return Collections.emptyList();
    }

    List<Tuple2<K, INDArray[]>> output = new ArrayList<>(tupleCount);
    int currentArrayIndex = 0;

    while (currentArrayIndex < featuresList.size()) {
        int firstIdx = currentArrayIndex;
        int nextIdx = currentArrayIndex;
        int examplesInBatch = 0;
        List<INDArray[]> toMerge = new ArrayList<>();
        firstShapes = null;
        while (nextIdx < featuresList.size() && examplesInBatch < batchSize) {
            INDArray[] f = featuresList.get(nextIdx);
            if (firstShapes == null) {
                firstShapes = new int[f.length][0];
                for (int i = 0; i < firstShapes.length; i++) {
                    firstShapes[i] = f[i].shape();
                }
            } else if (sizesDiffer) {
                boolean breakWhile = false;
                for (int i = 0; i < firstShapes.length; i++) {
                    for (int j = 1; j < firstShapes[i].length; j++) {
                        if (firstShapes[i][j] != featuresList.get(nextIdx)[i].size(j)) {
                            //Next example has a different size. So: don't add it to the current batch, just process what we have
                            breakWhile = true;
                            break;
                        }
                    }
                }
                if (breakWhile) {
                    break;
                }
            }

            toMerge.add(f);
            examplesInBatch += f[0].size(0);
            nextIdx++;
        }

        INDArray[] batchFeatures = new INDArray[toMerge.get(0).length];
        for (int i = 0; i < batchFeatures.length; i++) {
            INDArray[] tempArr = new INDArray[toMerge.size()];
            for (int j = 0; j < tempArr.length; j++) {
                tempArr[j] = toMerge.get(j)[i];
            }
            batchFeatures[i] = Nd4j.concat(0, tempArr);
        }

        INDArray[] out = network.output(false, batchFeatures);

        examplesInBatch = 0;
        for (int i = firstIdx; i < nextIdx; i++) {
            int numExamples = origSizeList.get(i);
            INDArray[] outSubset = new INDArray[out.length];
            for (int j = 0; j < out.length; j++) {
                outSubset[j] = getSubset(examplesInBatch, examplesInBatch + numExamples, out[j]);
            }
            examplesInBatch += numExamples;

            output.add(new Tuple2<>(keyList.get(i), outSubset));
        }

        currentArrayIndex += (nextIdx - firstIdx);
    }

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();

    return output;
}
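The getSubset helper invoked above is not part of this snippet. A plausible sketch, assuming it slices the examples in the interval [exampleStart, exampleEnd) along dimension 0 and supports the 2d/3d/4d activations mentioned in the comments (this is an illustrative reconstruction, not the library's verbatim code):

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.indexing.NDArrayIndex;

// Sketch only: extract examples [exampleStart, exampleEnd) along dimension 0.
private INDArray getSubset(int exampleStart, int exampleEnd, INDArray from) {
    switch (from.rank()) {
        case 2:     // e.g. dense/MLP activations: [examples, features]
            return from.get(NDArrayIndex.interval(exampleStart, exampleEnd), NDArrayIndex.all());
        case 3:     // e.g. recurrent activations: [examples, features, timeSteps]
            return from.get(NDArrayIndex.interval(exampleStart, exampleEnd), NDArrayIndex.all(), NDArrayIndex.all());
        case 4:     // e.g. convolutional activations: [examples, channels, height, width]
            return from.get(NDArrayIndex.interval(exampleStart, exampleEnd), NDArrayIndex.all(), NDArrayIndex.all(),
                            NDArrayIndex.all());
        default:
            throw new RuntimeException("Invalid rank: " + from.rank());
    }
}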
Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
From the class ScoreExamplesWithKeyFunctionAdapter, method call.
@Override
public Iterable<Tuple2<K, Double>> call(Iterator<Tuple2<K, DataSet>> iterator) throws Exception {
    if (!iterator.hasNext()) {
        return Collections.emptyList();
    }
    MultiLayerNetwork network = new MultiLayerNetwork(MultiLayerConfiguration.fromJson(jsonConfig.getValue()));
    network.init();
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParameters(val);

    List<Tuple2<K, Double>> ret = new ArrayList<>();
    List<DataSet> collect = new ArrayList<>(batchSize);
    List<K> collectKey = new ArrayList<>(batchSize);
    int totalCount = 0;
    while (iterator.hasNext()) {
        collect.clear();
        collectKey.clear();
        int nExamples = 0;
        while (iterator.hasNext() && nExamples < batchSize) {
            Tuple2<K, DataSet> t2 = iterator.next();
            DataSet ds = t2._2();
            int n = ds.numExamples();
            if (n != 1)
                throw new IllegalStateException("Cannot score examples with one key per data set if "
                                + "data set contains more than 1 example (numExamples: " + n + ")");
            collect.add(ds);
            collectKey.add(t2._1());
            nExamples += n;
        }
        totalCount += nExamples;

        DataSet data = DataSet.merge(collect);

        INDArray scores = network.scoreExamples(data, addRegularization);
        double[] doubleScores = scores.data().asDouble();

        for (int i = 0; i < doubleScores.length; i++) {
            ret.add(new Tuple2<>(collectKey.get(i), doubleScores[i]));
        }
    }

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();

    if (log.isDebugEnabled()) {
        log.debug("Scored {} examples ", totalCount);
    }
    return ret;
}
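The merge-then-score pattern in the inner loop can be exercised on its own. A minimal sketch, assuming an already initialized MultiLayerNetwork named net whose input/output sizes match the illustrative shapes below (10 inputs, 3 outputs); the name, shapes, and random data are all made up for the example:

import java.util.Arrays;
import java.util.List;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.factory.Nd4j;

// Two single-example DataSets, matching the "one key per example" constraint enforced above
List<DataSet> singles = Arrays.asList(
        new DataSet(Nd4j.rand(1, 10), Nd4j.rand(1, 3)),
        new DataSet(Nd4j.rand(1, 10), Nd4j.rand(1, 3)));

// Merge into a single 2-example batch, then score each example individually
DataSet merged = DataSet.merge(singles);
INDArray perExampleScores = net.scoreExamples(merged, true);    // one score per example
double[] scores = perExampleScores.data().asDouble();           // same order as 'singles'

Because merging preserves example order, the i-th score can be paired back with the i-th key, which is exactly how collectKey is used in the adapter above.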
Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
From the class ScoreFlatMapFunctionCGDataSetAdapter, method call.
@Override
public Iterable<Tuple2<Integer, Double>> call(Iterator<DataSet> dataSetIterator) throws Exception {
    if (!dataSetIterator.hasNext()) {
        return Collections.singletonList(new Tuple2<>(0, 0.0));
    }
    //Does batching where appropriate
    DataSetIterator iter = new IteratorDataSetIterator(dataSetIterator, minibatchSize);

    ComputationGraph network = new ComputationGraph(ComputationGraphConfiguration.fromJson(json));
    network.init();
    //.value() is shared by all executors on single machine -> OK, as params are not changed in score function
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParams(val);

    List<Tuple2<Integer, Double>> out = new ArrayList<>();
    while (iter.hasNext()) {
        DataSet ds = iter.next();
        double score = network.score(ds, false);
        int numExamples = ds.getFeatureMatrix().size(0);
        out.add(new Tuple2<>(numExamples, score * numExamples));
    }

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();

    return out;
}
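The IteratorDataSetIterator wrapper used above is what turns an arbitrary stream of DataSet objects into minibatches of roughly minibatchSize examples before scoring. A small, self-contained illustration; the sizes and shapes are made up, and the exact grouping of examples per batch is up to the wrapper:

import java.util.ArrayList;
import java.util.List;
import org.deeplearning4j.datasets.iterator.IteratorDataSetIterator;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.factory.Nd4j;

// Three 2-example DataSets as the raw input stream
List<DataSet> source = new ArrayList<>();
for (int i = 0; i < 3; i++) {
    source.add(new DataSet(Nd4j.rand(2, 5), Nd4j.rand(2, 2)));
}

// Re-batch so each call to next() yields a minibatch of (up to) the requested size
DataSetIterator iter = new IteratorDataSetIterator(source.iterator(), 4);
while (iter.hasNext()) {
    DataSet batch = iter.next();
    System.out.println("examples in batch: " + batch.getFeatures().size(0));
}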
Use of scala.Tuple2 in project deeplearning4j by deeplearning4j.
From the class ScoreFlatMapFunctionCGMultiDataSetAdapter, method call.
@Override
public Iterable<Tuple2<Integer, Double>> call(Iterator<MultiDataSet> dataSetIterator) throws Exception {
    if (!dataSetIterator.hasNext()) {
        return Collections.singletonList(new Tuple2<>(0, 0.0));
    }
    //Does batching where appropriate
    MultiDataSetIterator iter = new IteratorMultiDataSetIterator(dataSetIterator, minibatchSize);

    ComputationGraph network = new ComputationGraph(ComputationGraphConfiguration.fromJson(json));
    network.init();
    //.value() is shared by all executors on single machine -> OK, as params are not changed in score function
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParams(val);

    List<Tuple2<Integer, Double>> out = new ArrayList<>();
    while (iter.hasNext()) {
        MultiDataSet ds = iter.next();
        double score = network.score(ds, false);
        int numExamples = ds.getFeatures(0).size(0);
        out.add(new Tuple2<>(numExamples, score * numExamples));
    }

    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();

    return out;
}
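Each tuple emitted by these two adapters carries (exampleCount, score * exampleCount), so the per-partition results can later be reduced into a single example-weighted average, exactly as calculateScoreMultiDataSet does with countAndSumScores. In plain Java terms, reusing the out list and Tuple2 type from the snippet above:

// Sketch: fold the (exampleCount, weightedScore) tuples into one average score.
int totalExamples = 0;
double totalScore = 0.0;
for (Tuple2<Integer, Double> t : out) {
    totalExamples += t._1();
    totalScore += t._2();
}
double averageScore = totalScore / totalExamples;   // real code should guard against totalExamples == 0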