
Example 26 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class SparkComputationGraph method calculateScoreMultiDataSet.

/**
     * Calculate the score for all examples in the provided {@code JavaRDD<MultiDataSet>}, either by summing
     * or averaging over the entire data set.
     *
     * @param data          Data to score
     * @param average       Whether to average the scores (true) or simply sum them (false)
     * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
     *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
     *                      in one go)
     */
public double calculateScoreMultiDataSet(JavaRDD<MultiDataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Integer, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGMultiDataSet(conf.toJson(), sc.broadcast(network.params(false)), minibatchSize));
    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Integer, Double> countAndSumScores = rdd.reduce(new IntDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IntDoubleReduceFunction(org.deeplearning4j.spark.impl.common.reduce.IntDoubleReduceFunction) Tuple2(scala.Tuple2)
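
The reduce step above collapses the per-partition (exampleCount, scoreSum) pairs into a single pair before the sum/average decision. The source of IntDoubleReduceFunction is not shown on this page; the following is a minimal sketch of what such a reducer plausibly looks like, based on the comment above ("example count + sum of scores"), not the verbatim deeplearning4j implementation.

import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

public class IntDoubleReduceFunction implements Function2<Tuple2<Integer, Double>, Tuple2<Integer, Double>, Tuple2<Integer, Double>> {

    @Override
    public Tuple2<Integer, Double> call(Tuple2<Integer, Double> first, Tuple2<Integer, Double> second) throws Exception {
        //Add the example counts and the score sums; the caller divides by the total count when an average is requested
        return new Tuple2<>(first._1() + second._1(), first._2() + second._2());
    }
}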

Example 27 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class GraphFeedForwardWithKeyFunctionAdapter method call.

@Override
public Iterable<Tuple2<K, INDArray[]>> call(Iterator<Tuple2<K, INDArray[]>> iterator) throws Exception {
    if (!iterator.hasNext()) {
        return Collections.emptyList();
    }
    ComputationGraph network = new ComputationGraph(ComputationGraphConfiguration.fromJson(jsonConfig.getValue()));
    network.init();
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParams(val);
    //Issue: for 2d data (MLPs etc) we can just stack the examples.
    //But: for 3d and 4d: in principle the data sizes could be different
    //We could handle that with mask arrays - but it gets messy. The approach used here is simpler but less efficient
    List<INDArray[]> featuresList = new ArrayList<>(batchSize);
    List<K> keyList = new ArrayList<>(batchSize);
    List<Integer> origSizeList = new ArrayList<>();
    int[][] firstShapes = null;
    boolean sizesDiffer = false;
    int tupleCount = 0;
    while (iterator.hasNext()) {
        Tuple2<K, INDArray[]> t2 = iterator.next();
        if (firstShapes == null) {
            firstShapes = new int[t2._2().length][0];
            for (int i = 0; i < firstShapes.length; i++) {
                firstShapes[i] = t2._2()[i].shape();
            }
        } else if (!sizesDiffer) {
            for (int i = 0; i < firstShapes.length; i++) {
                for (int j = 1; j < firstShapes[i].length; j++) {
                    if (firstShapes[i][j] != featuresList.get(tupleCount - 1)[i].size(j)) {
                        sizesDiffer = true;
                        break;
                    }
                }
            }
        }
        featuresList.add(t2._2());
        keyList.add(t2._1());
        origSizeList.add(t2._2()[0].size(0));
        tupleCount++;
    }
    if (tupleCount == 0) {
        return Collections.emptyList();
    }
    List<Tuple2<K, INDArray[]>> output = new ArrayList<>(tupleCount);
    int currentArrayIndex = 0;
    while (currentArrayIndex < featuresList.size()) {
        int firstIdx = currentArrayIndex;
        int nextIdx = currentArrayIndex;
        int examplesInBatch = 0;
        List<INDArray[]> toMerge = new ArrayList<>();
        firstShapes = null;
        while (nextIdx < featuresList.size() && examplesInBatch < batchSize) {
            INDArray[] f = featuresList.get(nextIdx);
            if (firstShapes == null) {
                firstShapes = new int[f.length][0];
                for (int i = 0; i < firstShapes.length; i++) {
                    firstShapes[i] = f[i].shape();
                }
            } else if (sizesDiffer) {
                boolean breakWhile = false;
                for (int i = 0; i < firstShapes.length; i++) {
                    for (int j = 1; j < firstShapes[i].length; j++) {
                        if (firstShapes[i][j] != featuresList.get(nextIdx)[i].size(j)) {
                            //Next example has a different size. So: don't add it to the current batch, just process what we have
                            breakWhile = true;
                            break;
                        }
                    }
                }
                if (breakWhile) {
                    break;
                }
            }
            toMerge.add(f);
            examplesInBatch += f[0].size(0);
            nextIdx++;
        }
        INDArray[] batchFeatures = new INDArray[toMerge.get(0).length];
        for (int i = 0; i < batchFeatures.length; i++) {
            INDArray[] tempArr = new INDArray[toMerge.size()];
            for (int j = 0; j < tempArr.length; j++) {
                tempArr[j] = toMerge.get(j)[i];
            }
            batchFeatures[i] = Nd4j.concat(0, tempArr);
        }
        INDArray[] out = network.output(false, batchFeatures);
        examplesInBatch = 0;
        for (int i = firstIdx; i < nextIdx; i++) {
            int numExamples = origSizeList.get(i);
            INDArray[] outSubset = new INDArray[out.length];
            for (int j = 0; j < out.length; j++) {
                outSubset[j] = getSubset(examplesInBatch, examplesInBatch + numExamples, out[j]);
            }
            examplesInBatch += numExamples;
            output.add(new Tuple2<>(keyList.get(i), outSubset));
        }
        currentArrayIndex += (nextIdx - firstIdx);
    }
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();
    return output;
}
Also used : ArrayList(java.util.ArrayList) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Tuple2(scala.Tuple2) ComputationGraph(org.deeplearning4j.nn.graph.ComputationGraph)
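
The loop above calls a getSubset helper (not shown on this page) to slice each key's examples back out of the merged network output. Below is a minimal sketch of such a helper, assuming standard ND4J indexing along dimension 0; the actual implementation may treat 2d, 3d and 4d activations separately.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.indexing.INDArrayIndex;
import org.nd4j.linalg.indexing.NDArrayIndex;

//Hypothetical helper: return examples [exampleStart, exampleEnd) along dimension 0, keeping all other dimensions
private static INDArray getSubset(int exampleStart, int exampleEnd, INDArray from) {
    INDArrayIndex[] indexes = new INDArrayIndex[from.rank()];
    indexes[0] = NDArrayIndex.interval(exampleStart, exampleEnd);
    for (int i = 1; i < indexes.length; i++) {
        indexes[i] = NDArrayIndex.all();
    }
    return from.get(indexes);
}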

Example 28 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class ScoreExamplesWithKeyFunctionAdapter method call.

@Override
public Iterable<Tuple2<K, Double>> call(Iterator<Tuple2<K, DataSet>> iterator) throws Exception {
    if (!iterator.hasNext()) {
        return Collections.emptyList();
    }
    MultiLayerNetwork network = new MultiLayerNetwork(MultiLayerConfiguration.fromJson(jsonConfig.getValue()));
    network.init();
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParameters(val);
    List<Tuple2<K, Double>> ret = new ArrayList<>();
    List<DataSet> collect = new ArrayList<>(batchSize);
    List<K> collectKey = new ArrayList<>(batchSize);
    int totalCount = 0;
    while (iterator.hasNext()) {
        collect.clear();
        collectKey.clear();
        int nExamples = 0;
        while (iterator.hasNext() && nExamples < batchSize) {
            Tuple2<K, DataSet> t2 = iterator.next();
            DataSet ds = t2._2();
            int n = ds.numExamples();
            if (n != 1)
                throw new IllegalStateException("Cannot score examples with one key per data set if " + "data set contains more than 1 example (numExamples: " + n + ")");
            collect.add(ds);
            collectKey.add(t2._1());
            nExamples += n;
        }
        totalCount += nExamples;
        DataSet data = DataSet.merge(collect);
        INDArray scores = network.scoreExamples(data, addRegularization);
        double[] doubleScores = scores.data().asDouble();
        for (int i = 0; i < doubleScores.length; i++) {
            ret.add(new Tuple2<>(collectKey.get(i), doubleScores[i]));
        }
    }
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();
    if (log.isDebugEnabled()) {
        log.debug("Scored {} examples ", totalCount);
    }
    return ret;
}
Also used : DataSet(org.nd4j.linalg.dataset.DataSet) ArrayList(java.util.ArrayList) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Tuple2(scala.Tuple2) MultiLayerNetwork(org.deeplearning4j.nn.multilayer.MultiLayerNetwork)
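
The score-to-key alignment in the loop above relies on DataSet.merge stacking the collected single-example DataSets in list order, so that doubleScores[i] corresponds to collectKey.get(i). A small self-contained sketch of that ordering assumption (toy data, not from the project):

import java.util.Arrays;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.factory.Nd4j;

public class MergeOrderSketch {
    public static void main(String[] args) {
        //Two single-example DataSets, as required by the adapter above
        DataSet a = new DataSet(Nd4j.create(new double[] {1, 2}, new int[] {1, 2}),
                        Nd4j.create(new double[] {1, 0}, new int[] {1, 2}));
        DataSet b = new DataSet(Nd4j.create(new double[] {3, 4}, new int[] {1, 2}),
                        Nd4j.create(new double[] {0, 1}, new int[] {1, 2}));
        //merge stacks the examples in list order: row 0 comes from a, row 1 from b
        DataSet merged = DataSet.merge(Arrays.asList(a, b));
        System.out.println(merged.getFeatures());
    }
}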

Example 29 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class ScoreFlatMapFunctionCGDataSetAdapter method call.

@Override
public Iterable<Tuple2<Integer, Double>> call(Iterator<DataSet> dataSetIterator) throws Exception {
    if (!dataSetIterator.hasNext()) {
        return Collections.singletonList(new Tuple2<>(0, 0.0));
    }
    //Does batching where appropriate
    DataSetIterator iter = new IteratorDataSetIterator(dataSetIterator, minibatchSize);
    ComputationGraph network = new ComputationGraph(ComputationGraphConfiguration.fromJson(json));
    network.init();
    //.value() is shared by all executors on single machine -> OK, as params are not changed in score function
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParams(val);
    List<Tuple2<Integer, Double>> out = new ArrayList<>();
    while (iter.hasNext()) {
        DataSet ds = iter.next();
        double score = network.score(ds, false);
        int numExamples = ds.getFeatureMatrix().size(0);
        out.add(new Tuple2<>(numExamples, score * numExamples));
    }
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();
    return out;
}
Also used : DataSet(org.nd4j.linalg.dataset.DataSet) ArrayList(java.util.ArrayList) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Tuple2(scala.Tuple2) ComputationGraph(org.deeplearning4j.nn.graph.ComputationGraph) IteratorDataSetIterator(org.deeplearning4j.datasets.iterator.IteratorDataSetIterator) DataSetIterator(org.nd4j.linalg.dataset.api.iterator.DataSetIterator)
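
The (numExamples, score * numExamples) pairs produced here make the final result a per-example weighted mean rather than a mean of minibatch means. For example, if a partition yields minibatches of 32 examples with score 0.50 and 8 examples with score 0.60, the emitted tuples are (32, 16.0) and (8, 4.8); reducing them (as in Example 26) gives (40, 20.8), and averaging yields 20.8 / 40 = 0.52.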

Example 30 with Tuple2

use of scala.Tuple2 in project deeplearning4j by deeplearning4j.

the class ScoreFlatMapFunctionCGMultiDataSetAdapter method call.

@Override
public Iterable<Tuple2<Integer, Double>> call(Iterator<MultiDataSet> dataSetIterator) throws Exception {
    if (!dataSetIterator.hasNext()) {
        return Collections.singletonList(new Tuple2<>(0, 0.0));
    }
    //Does batching where appropriate
    MultiDataSetIterator iter = new IteratorMultiDataSetIterator(dataSetIterator, minibatchSize);
    ComputationGraph network = new ComputationGraph(ComputationGraphConfiguration.fromJson(json));
    network.init();
    //.value() is shared by all executors on single machine -> OK, as params are not changed in score function
    INDArray val = params.value().unsafeDuplication();
    if (val.length() != network.numParams(false))
        throw new IllegalStateException("Network did not have same number of parameters as the broadcast set parameters");
    network.setParams(val);
    List<Tuple2<Integer, Double>> out = new ArrayList<>();
    while (iter.hasNext()) {
        MultiDataSet ds = iter.next();
        double score = network.score(ds, false);
        int numExamples = ds.getFeatures(0).size(0);
        out.add(new Tuple2<>(numExamples, score * numExamples));
    }
    if (Nd4j.getExecutioner() instanceof GridExecutioner)
        ((GridExecutioner) Nd4j.getExecutioner()).flushQueueBlocking();
    return out;
}
Also used : IteratorMultiDataSetIterator(org.deeplearning4j.datasets.iterator.IteratorMultiDataSetIterator) ArrayList(java.util.ArrayList) MultiDataSetIterator(org.nd4j.linalg.dataset.api.iterator.MultiDataSetIterator) GridExecutioner(org.nd4j.linalg.api.ops.executioner.GridExecutioner) INDArray(org.nd4j.linalg.api.ndarray.INDArray) MultiDataSet(org.nd4j.linalg.dataset.api.MultiDataSet) Tuple2(scala.Tuple2) ComputationGraph(org.deeplearning4j.nn.graph.ComputationGraph)

Aggregations

Tuple2 (scala.Tuple2): 181
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 57
ArrayList (java.util.ArrayList): 43
IOException (java.io.IOException): 32
Test (org.junit.Test): 32
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 28
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 23
List (java.util.List): 22
Function (org.apache.spark.api.java.function.Function): 19
File (java.io.File): 18
Collectors (java.util.stream.Collectors): 18
GATKException (org.broadinstitute.hellbender.exceptions.GATKException): 18
Configuration (org.apache.hadoop.conf.Configuration): 17
UserException (org.broadinstitute.hellbender.exceptions.UserException): 17
Broadcast (org.apache.spark.broadcast.Broadcast): 16
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 16
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 16
SparkConf (org.apache.spark.SparkConf): 15
JavaRDD (org.apache.spark.api.java.JavaRDD): 15
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 14