Search in sources :

Example 56 with DeltaSet

use of com.simiacryptus.mindseye.lang.DeltaSet in project MindsEye by SimiaCryptus.

the class ProductLayer method evalAndFree.

/**
 * Multiplies two inputs element-wise on the GPU via cuDNN and wires up back-propagation.
 *
 * <p>When CUDA is disabled the call is delegated to the compatibility layer. Otherwise the
 * forward pass runs {@code cudnnOpTensor} with {@code CUDNN_OP_TENSOR_MUL} into an output whose
 * descriptor is built from the left input's dimensions and batch length. On back-propagation
 * each live input receives {@code delta} multiplied by the other input's data; for the right
 * input, if its dimensions or batch length differ from the left input's, that product is
 * additionally summed ({@code CUDNN_REDUCE_TENSOR_ADD}) into a tensor of the right input's shape.
 *
 * @param inObj exactly two inputs: left and right
 * @return the product result; freeing it releases both inputs and their data
 * @throws IllegalArgumentException if the number of inputs is not 2, or the left input does not
 *                                  have exactly 3 dimensions
 */
@Nullable
@Override
public Result evalAndFree(@Nonnull final Result... inObj) {
    // The compatibility fallback is taken before any argument validation.
    if (!CudaSystem.isEnabled()) {
        return getCompatibilityLayer().evalAndFree(inObj);
    }
    if (inObj.length != 2) {
        throw new IllegalArgumentException("inObj.length=" + inObj.length);
    }
    Result left = inObj[0];
    Result right = inObj[1];
    final TensorList leftData = left.getData();
    final TensorList rightData = right.getData();
    @Nonnull final int[] leftDimensions = leftData.getDimensions();
    @Nonnull final int[] rightDimensions = rightData.getDimensions();
    final int length = leftData.length();
    if (3 != leftDimensions.length) {
        throw new IllegalArgumentException("dimensions=" + Arrays.toString(leftDimensions));
    }
    return new Result(CudaSystem.run(gpu -> {
        // Forward pass: output = 1.0 * left (MUL) 1.0 * right, written with beta = 0.0.
        @Nonnull final CudaResource<cudnnOpTensorDescriptor> opDescriptor = gpu.newOpDescriptor(cudnnOpTensorOp.CUDNN_OP_TENSOR_MUL, precision);
        @Nonnull final CudaDevice.CudaTensorDescriptor outputDescriptor = gpu.newTensorDescriptor(precision, length, leftDimensions[2], leftDimensions[1], leftDimensions[0], leftDimensions[2] * leftDimensions[1] * leftDimensions[0], leftDimensions[1] * leftDimensions[0], leftDimensions[0], 1);
        @Nullable final CudaTensor lPtr = gpu.getTensor(leftData, precision, MemoryType.Device, false);
        @Nullable final CudaTensor rPtr = gpu.getTensor(rightData, precision, MemoryType.Device, false);
        // assert lPtr.size == rPtr.size;
        @Nonnull final CudaMemory outputPtr = gpu.allocate((long) precision.size * outputDescriptor.nStride * length, MemoryType.Device, true);
        CudaMemory lPtrMemory = lPtr.getMemory(gpu);
        CudaMemory rPtrMemory = rPtr.getMemory(gpu);
        CudaSystem.handle(gpu.cudnnOpTensor(opDescriptor.getPtr(), precision.getPointer(1.0), lPtr.descriptor.getPtr(), lPtrMemory.getPtr(), precision.getPointer(1.0), rPtr.descriptor.getPtr(), rPtrMemory.getPtr(), precision.getPointer(0.0), outputDescriptor.getPtr(), outputPtr.getPtr()));
        assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
        lPtrMemory.dirty();
        rPtrMemory.dirty();
        outputPtr.dirty();
        lPtrMemory.freeRef();
        rPtrMemory.freeRef();
        rPtr.freeRef();
        lPtr.freeRef();
        opDescriptor.freeRef();
        CudaTensor cudaTensor = CudaTensor.wrap(outputPtr, outputDescriptor, precision);
        return CudaTensorList.wrap(cudaTensor, length, leftDimensions, precision);
    }, leftData), (@Nonnull final DeltaSet<Layer> buffer, @Nonnull final TensorList delta) -> {
        // Gradient for the left input: delta (MUL) right data.
        if (left.isAlive()) {
            @Nonnull TensorList data = CudaSystem.run(gpu -> {
                @Nonnull final CudaResource<cudnnOpTensorDescriptor> opDescriptor = gpu.newOpDescriptor(cudnnOpTensorOp.CUDNN_OP_TENSOR_MUL, precision);
                @Nonnull final CudaDevice.CudaTensorDescriptor outputDescriptor = gpu.newTensorDescriptor(precision, length, leftDimensions[2], leftDimensions[1], leftDimensions[0], leftDimensions[2] * leftDimensions[1] * leftDimensions[0], leftDimensions[1] * leftDimensions[0], leftDimensions[0], 1);
                @Nullable final CudaTensor deltaTensor = gpu.getTensor(delta, precision, MemoryType.Device, false);
                @Nullable final CudaTensor rightTensor = gpu.getTensor(right.getData(), precision, MemoryType.Device, false);
                // assert deltaTensor.size == rightTensor.size;
                @Nonnull final CudaMemory outputPtr = gpu.allocate((long) precision.size * outputDescriptor.nStride * length, MemoryType.Device, true);
                CudaMemory deltaTensorMemory = deltaTensor.getMemory(gpu);
                CudaMemory rightTensorMemory = rightTensor.getMemory(gpu);
                CudaSystem.handle(gpu.cudnnOpTensor(opDescriptor.getPtr(), precision.getPointer(1.0), deltaTensor.descriptor.getPtr(), deltaTensorMemory.getPtr(), precision.getPointer(1.0), rightTensor.descriptor.getPtr(), rightTensorMemory.getPtr(), precision.getPointer(0.0), outputDescriptor.getPtr(), outputPtr.getPtr()));
                deltaTensorMemory.dirty();
                rightTensorMemory.dirty();
                outputPtr.dirty();
                deltaTensorMemory.freeRef();
                rightTensorMemory.freeRef();
                CudaTensor cudaTensor = new CudaTensor(outputPtr, outputDescriptor, precision);
                Arrays.stream(new ReferenceCounting[] { deltaTensor, rightTensor, opDescriptor, outputDescriptor }).forEach(ReferenceCounting::freeRef);
                outputPtr.freeRef();
                return CudaTensorList.wrap(cudaTensor, length, leftDimensions, precision);
            }, delta);
            left.accumulate(buffer, data);
        }
        // Gradient for the right input: delta (MUL) left data, reduced if right has a different shape.
        // delta is released exactly once: inside this branch, or in the else branch below.
        if (right.isAlive()) {
            @Nonnull TensorList data = CudaSystem.run(gpu -> {
                @Nonnull final CudaResource<cudnnOpTensorDescriptor> opDescriptor = gpu.newOpDescriptor(cudnnOpTensorOp.CUDNN_OP_TENSOR_MUL, precision);
                @Nonnull final CudaDevice.CudaTensorDescriptor expandedDescriptor = gpu.newTensorDescriptor(precision, length, leftDimensions[2], leftDimensions[1], leftDimensions[0], leftDimensions[2] * leftDimensions[1] * leftDimensions[0], leftDimensions[1] * leftDimensions[0], leftDimensions[0], 1);
                @Nullable final CudaTensor deltaTensor = gpu.getTensor(delta, precision, MemoryType.Device, false);
                delta.freeRef();
                @Nullable final CudaTensor leftTensor = gpu.getTensor(left.getData(), precision, MemoryType.Device, false);
                // assert deltaTensor.size == rightTensor.size;
                @Nonnull final CudaMemory outputPtr = gpu.allocate((long) precision.size * expandedDescriptor.nStride * length, MemoryType.Device, true);
                CudaMemory deltaTensorMemory = deltaTensor.getMemory(gpu);
                CudaMemory leftTensorMemory = leftTensor.getMemory(gpu);
                CudaSystem.handle(gpu.cudnnOpTensor(opDescriptor.getPtr(), precision.getPointer(1.0), deltaTensor.descriptor.getPtr(), deltaTensorMemory.getPtr(), precision.getPointer(1.0), leftTensor.descriptor.getPtr(), leftTensorMemory.getPtr(), precision.getPointer(0.0), expandedDescriptor.getPtr(), outputPtr.getPtr()));
                deltaTensorMemory.dirty();
                leftTensorMemory.dirty();
                outputPtr.dirty();
                if (Arrays.equals(rightDimensions, leftDimensions) && length == rightData.length()) {
                    // Shapes match: the full-size product is the right gradient as-is.
                    deltaTensorMemory.freeRef();
                    leftTensorMemory.freeRef();
                    assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
                    outputPtr.dirty();
                    CudaTensor cudaTensor = new CudaTensor(outputPtr, expandedDescriptor, precision);
                    Stream.of(deltaTensor, leftTensor, opDescriptor, expandedDescriptor, outputPtr).forEach(ReferenceCounting::freeRef);
                    CudaTensorList tensorList = CudaTensorList.wrap(cudaTensor, length, rightDimensions, precision);
                    return tensorList;
                } else {
                    // Shapes differ: sum the full-size product down to the right input's shape.
                    @Nonnull final CudaDevice.CudaTensorDescriptor reducedOutputDescriptor = gpu.newTensorDescriptor(precision, rightData.length(), rightDimensions[2], rightDimensions[1], rightDimensions[0], rightDimensions[2] * rightDimensions[1] * rightDimensions[0], rightDimensions[1] * rightDimensions[0], rightDimensions[0], 1);
                    long size = (long) precision.size * reducedOutputDescriptor.nStride * rightData.length();
                    @Nonnull final CudaMemory reducedOutputPtr = gpu.allocate(size, MemoryType.Managed, true);
                    CudaResource<cudnnReduceTensorDescriptor> reduceTensorDescriptor = gpu.cudnnCreateReduceTensorDescriptor(cudnnReduceTensorOp.CUDNN_REDUCE_TENSOR_ADD, precision.code, cudnnNanPropagation.CUDNN_NOT_PROPAGATE_NAN, cudnnReduceTensorIndices.CUDNN_REDUCE_TENSOR_NO_INDICES, cudnnIndicesType.CUDNN_32BIT_INDICES);
                    @Nonnull final CudaMemory workspacePtr = gpu.allocate(outputPtr.size, MemoryType.Device, true);
                    @Nonnull final CudaMemory indexPtr = gpu.allocate(3, MemoryType.Device, false);
                    // outputPtr.synchronize();
                    // Check the status code like every other cuDNN call here, so a failed reduction
                    // raises instead of silently yielding an invalid gradient.
                    CudaSystem.handle(gpu.cudnnReduceTensor(reduceTensorDescriptor.getPtr(), indexPtr.getPtr(), indexPtr.size, workspacePtr.getPtr(), workspacePtr.size, precision.getPointer(1.0), expandedDescriptor.getPtr(), outputPtr.getPtr(), precision.getPointer(0.0), reducedOutputDescriptor.getPtr(), reducedOutputPtr.getPtr()));
                    reducedOutputPtr.dirty();
                    workspacePtr.dirty();
                    outputPtr.dirty();
                    deltaTensorMemory.freeRef();
                    leftTensorMemory.freeRef();
                    CudaTensor cudaTensor = new CudaTensor(reducedOutputPtr, reducedOutputDescriptor, precision);
                    Stream.of(deltaTensor, leftTensor, opDescriptor, expandedDescriptor, outputPtr, reducedOutputPtr, reducedOutputDescriptor, reduceTensorDescriptor, workspacePtr, indexPtr).forEach(ReferenceCounting::freeRef);
                    CudaTensorList tensorList = CudaTensorList.wrap(cudaTensor, rightData.length(), rightDimensions, precision);
                    return tensorList;
                }
            }, delta);
            right.accumulate(buffer, data);
        } else {
            delta.freeRef();
        }
    }) {

        @Override
        public void accumulate(final DeltaSet<Layer> buffer, final TensorList delta) {
            getAccumulator().accept(buffer, delta);
        }

        @Override
        protected void _free() {
            // Release the input data and the inputs themselves that this result was holding.
            leftData.freeRef();
            rightData.freeRef();
            left.freeRef();
            right.freeRef();
        }

        @Override
        public boolean isAlive() {
            // Alive as soon as any input can receive a gradient.
            for (@Nonnull final Result element : inObj) {
                if (element.isAlive()) {
                    return true;
                }
            }
            return false;
        }
    };
}
Also used : JsonObject(com.google.gson.JsonObject) Arrays(java.util.Arrays) CudaMemory(com.simiacryptus.mindseye.lang.cudnn.CudaMemory) jcuda.jcudnn.cudnnReduceTensorDescriptor(jcuda.jcudnn.cudnnReduceTensorDescriptor) jcuda.jcudnn.cudnnReduceTensorOp(jcuda.jcudnn.cudnnReduceTensorOp) Result(com.simiacryptus.mindseye.lang.Result) DataSerializer(com.simiacryptus.mindseye.lang.DataSerializer) Precision(com.simiacryptus.mindseye.lang.cudnn.Precision) Map(java.util.Map) Layer(com.simiacryptus.mindseye.lang.Layer) ReferenceCounting(com.simiacryptus.mindseye.lang.ReferenceCounting) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) CudaResource(com.simiacryptus.mindseye.lang.cudnn.CudaResource) CudaDevice(com.simiacryptus.mindseye.lang.cudnn.CudaDevice) CudaTensor(com.simiacryptus.mindseye.lang.cudnn.CudaTensor) jcuda.jcudnn.cudnnOpTensorOp(jcuda.jcudnn.cudnnOpTensorOp) CudaTensorList(com.simiacryptus.mindseye.lang.cudnn.CudaTensorList) jcuda.jcudnn.cudnnIndicesType(jcuda.jcudnn.cudnnIndicesType) jcuda.jcudnn.cudnnNanPropagation(jcuda.jcudnn.cudnnNanPropagation) jcuda.jcudnn.cudnnReduceTensorIndices(jcuda.jcudnn.cudnnReduceTensorIndices) List(java.util.List) LayerBase(com.simiacryptus.mindseye.lang.LayerBase) Stream(java.util.stream.Stream) CudaSystem(com.simiacryptus.mindseye.lang.cudnn.CudaSystem) TensorList(com.simiacryptus.mindseye.lang.TensorList) MemoryType(com.simiacryptus.mindseye.lang.cudnn.MemoryType) ProductInputsLayer(com.simiacryptus.mindseye.layers.java.ProductInputsLayer) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) jcuda.jcudnn.cudnnOpTensorDescriptor(jcuda.jcudnn.cudnnOpTensorDescriptor) CudaTensor(com.simiacryptus.mindseye.lang.cudnn.CudaTensor) CudaDevice(com.simiacryptus.mindseye.lang.cudnn.CudaDevice) Nonnull(javax.annotation.Nonnull) jcuda.jcudnn.cudnnReduceTensorDescriptor(jcuda.jcudnn.cudnnReduceTensorDescriptor) CudaMemory(com.simiacryptus.mindseye.lang.cudnn.CudaMemory) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) 
CudaTensorList(com.simiacryptus.mindseye.lang.cudnn.CudaTensorList) TensorList(com.simiacryptus.mindseye.lang.TensorList) Result(com.simiacryptus.mindseye.lang.Result) CudaTensorList(com.simiacryptus.mindseye.lang.cudnn.CudaTensorList) ReferenceCounting(com.simiacryptus.mindseye.lang.ReferenceCounting) CudaResource(com.simiacryptus.mindseye.lang.cudnn.CudaResource) jcuda.jcudnn.cudnnOpTensorDescriptor(jcuda.jcudnn.cudnnOpTensorDescriptor) Nullable(javax.annotation.Nullable) Nullable(javax.annotation.Nullable)

Example 57 with DeltaSet

use of com.simiacryptus.mindseye.lang.DeltaSet in project MindsEye by SimiaCryptus.

the class SimpleConvolutionLayer method evalAndFree.

/**
 * Runs a cuDNN convolution over the first input and wires up back-propagation.
 *
 * <p>When CUDA is disabled the call is delegated to the compatibility layer via
 * {@code evalAndFree}, so the caller's input references are released as this method's contract
 * implies. Otherwise the forward pass calls {@code cudnnConvolutionForward}. On
 * back-propagation two steps run in order: the kernel gradient is computed with
 * {@code cudnnConvolutionBackwardFilter} and added to the delta buffer unless the layer is
 * frozen, then the input gradient is computed with {@code cudnnConvolutionBackwardData} and
 * passed to the input if it is alive.
 *
 * @param inObj the inputs; only the first element is read by the GPU path
 * @return the convolution result; freeing it releases the kernel, the input data, the inputs and
 *         the extra reference taken on this layer
 * @throws ComponentException if any step of the forward or backward cuDNN computation fails
 */
@Nullable
@Override
public Result evalAndFree(@Nonnull final Result... inObj) {
    if (!CudaSystem.isEnabled()) {
        // Use evalAndFree (not eval): the compatibility layer's eval takes its own references to
        // the inputs, so the references handed to this method must still be released.
        return getCompatibilityLayer().evalAndFree(inObj);
    }
    final Result input = inObj[0];
    final TensorList inputData = input.getData();
    @Nonnull final int[] inputSize = inputData.getDimensions();
    @Nonnull final int[] kernelSize = kernel.getDimensions();
    final int[] outputSize = getOutputSize(inputSize);
    final int length = inputData.length();
    // Both references are released again in the returned result's _free().
    kernel.addRef();
    SimpleConvolutionLayer.this.addRef();
    return new Result(CudaSystem.run(gpu -> {
        assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
        @Nullable final CudaTensor inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, false);
        final CudaResource<cudnnFilterDescriptor> filterDescriptor = gpu.newFilterDescriptor(precision, cudnnTensorFormat.CUDNN_TENSOR_NCHW, outputSize[2], inputSize[2], kernelSize[1], kernelSize[0]);
        final CudaResource<cudnnConvolutionDescriptor> convolutionDescriptor = gpu.newConvolutions2dDescriptor(cudnnConvolutionMode.CUDNN_CONVOLUTION, precision, paddingY, paddingX, strideY, strideX, 1, 1);
        final int[] outputDims = IntStream.of(reverse(CudaSystem.getOutputDims(inputTensor.descriptor.getPtr(), filterDescriptor.getPtr(), convolutionDescriptor.getPtr()))).limit(3).toArray();
        final CudaDevice.CudaTensorDescriptor outputDescriptor = gpu.newTensorDescriptor(precision, length, outputDims[2], outputDims[1], outputDims[0], outputDims[2] * outputDims[1] * outputDims[0], outputDims[1] * outputDims[0], outputDims[0], 1);
        final int forwardAlgorithm = getForwardAlgorithm(gpu, inputTensor, filterDescriptor, convolutionDescriptor, outputDescriptor);
        final CudaMemory forwardWorkspace = gpu.allocateForwardWorkspace(inputTensor.descriptor.getPtr(), filterDescriptor.getPtr(), convolutionDescriptor.getPtr(), outputDescriptor.getPtr(), forwardAlgorithm, 1);
        try {
            assert 0 < kernel.getData().length;
            assert kernelSize[0] * kernelSize[1] * kernelSize[2] == kernel.getData().length;
            @Nonnull CudaMemory filterPtr = getCudaFilter(gpu);
            @Nonnull final CudaMemory outputBuffer = gpu.allocate((long) Tensor.length(outputDims) * length * precision.size, MemoryType.Managed.normalize(), true);
            CudaMemory inputTensorMemory = inputTensor.getMemory(gpu);
            // inputTensorMemory.synchronize();
            CudaSystem.handle(gpu.cudnnConvolutionForward(precision.getPointer(1.0), inputTensor.descriptor.getPtr(), inputTensorMemory.getPtr(), filterDescriptor.getPtr(), filterPtr.getPtr(), convolutionDescriptor.getPtr(), forwardAlgorithm, null == forwardWorkspace ? null : forwardWorkspace.getPtr(), null == forwardWorkspace ? 0 : forwardWorkspace.size, precision.getPointer(0.0), outputDescriptor.getPtr(), outputBuffer.getPtr()));
            assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
            // The call above tolerates a null workspace, so the bookkeeping must as well.
            if (null != forwardWorkspace) {
                forwardWorkspace.dirty();
            }
            filterPtr.dirty();
            outputBuffer.dirty();
            inputTensorMemory.dirty();
            // inputTensorMemory.synchronize();
            inputTensorMemory.freeRef();
            filterPtr.freeRef();
            // Extra reference balances the freeRef in the finally block; the wrapped tensor keeps the descriptor.
            outputDescriptor.addRef();
            return CudaTensorList.wrap(CudaTensor.wrap(outputBuffer, outputDescriptor, precision), length, outputDims, precision);
        } catch (@Nonnull final Throwable e) {
            throw new ComponentException(String.format("Error in convolution %s x %s", Arrays.toString(inputSize), Arrays.toString(kernelSize)), e);
        } finally {
            // Skip a null workspace so cleanup cannot mask the real outcome with a NullPointerException.
            Stream.of(inputTensor, filterDescriptor, outputDescriptor, forwardWorkspace, convolutionDescriptor).filter(resource -> null != resource).forEach(ReferenceCounting::freeRef);
        }
    }, inputData), (@Nonnull final DeltaSet<Layer> buffer, @Nonnull final TensorList delta) -> {
        delta.assertAlive();
        buffer.assertAlive();
        inputData.assertAlive();
        assert delta.length() == length;
        // delta is consumed twice (learnFn and backpropFn); each releases one reference.
        delta.addRef();
        // Step 1: kernel (weight) gradient, skipped when the layer is frozen.
        Runnable learnFn = () -> {
            if (!isFrozen()) {
                @Nonnull final Tensor weightGradient = CudaSystem.run(gpu -> {
                    @Nullable final CudaTensor deltaTensor = gpu.getTensor(delta, precision, MemoryType.Device, true);
                    delta.freeRef();
                    @Nullable final CudaTensor inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, false);
                    final CudaResource<cudnnFilterDescriptor> filterDescriptor = gpu.newFilterDescriptor(precision, cudnnTensorFormat.CUDNN_TENSOR_NCHW, outputSize[2], inputSize[2], kernelSize[1], kernelSize[0]);
                    final CudaResource<cudnnConvolutionDescriptor> convolutionDescriptor = gpu.newConvolutions2dDescriptor(cudnnConvolutionMode.CUDNN_CONVOLUTION, precision, paddingY, paddingX, strideY, strideX, 1, 1);
                    final int backwardFilterAlgorithm = getBackwardFilterAlgorithm(gpu, deltaTensor, inputTensor, filterDescriptor, convolutionDescriptor);
                    final CudaMemory backwardsFilterWorkSpace = gpu.allocateBackwardFilterWorkspace(inputTensor.descriptor.getPtr(), filterDescriptor.getPtr(), convolutionDescriptor.getPtr(), deltaTensor.descriptor.getPtr(), backwardFilterAlgorithm, 1);
                    @Nonnull CudaMemory filterPtr = gpu.allocate((long) kernel.length() * precision.size, MemoryType.Device, true);
                    try {
                        CudaMemory inputTensorMemory = inputTensor.getMemory(gpu);
                        CudaMemory deltaTensorMemory = deltaTensor.getMemory(gpu, MemoryType.Managed.normalize());
                        // inputTensorMemory.synchronize();
                        CudaSystem.handle(gpu.cudnnConvolutionBackwardFilter(precision.getPointer(1.0), inputTensor.descriptor.getPtr(), inputTensorMemory.getPtr(), deltaTensor.descriptor.getPtr(), deltaTensorMemory.getPtr(), convolutionDescriptor.getPtr(), backwardFilterAlgorithm, backwardsFilterWorkSpace.getPtr(), backwardsFilterWorkSpace.size, precision.getPointer(0.0), filterDescriptor.getPtr(), filterPtr.getPtr()));
                        filterPtr.dirty();
                        deltaTensorMemory.dirty();
                        inputTensorMemory.dirty();
                        backwardsFilterWorkSpace.dirty();
                        // backwardsFilterWorkSpace.synchronize();
                        inputTensorMemory.freeRef();
                        deltaTensorMemory.freeRef();
                        return filterPtr.read(precision, kernel.getDimensions());
                    } catch (@Nonnull final Throwable e) {
                        throw new ComponentException(String.format("Error in convolution %s x %s => %s", Arrays.toString(inputSize), Arrays.toString(kernelSize), Arrays.toString(outputSize)), e);
                    } finally {
                        inputTensor.freeRef();
                        filterPtr.freeRef();
                        deltaTensor.freeRef();
                        Stream.of(filterDescriptor, convolutionDescriptor, backwardsFilterWorkSpace).forEach(ReferenceCounting::freeRef);
                    }
                }, delta);
                buffer.get(SimpleConvolutionLayer.this, kernel.getData()).addInPlace(weightGradient.getData()).freeRef();
                weightGradient.freeRef();
                clearCudaFilters();
            } else {
                delta.freeRef();
            }
        };
        // Step 2: input gradient, skipped when the input is not alive.
        Runnable backpropFn = () -> {
            if (input.isAlive()) {
                final TensorList inputBufferTensors = CudaSystem.run(gpu -> {
                    final CudaDevice.CudaTensorDescriptor inputDescriptor = gpu.newTensorDescriptor(precision, length, inputSize[2], inputSize[1], inputSize[0], inputSize[2] * inputSize[1] * inputSize[0], inputSize[1] * inputSize[0], inputSize[0], 1);
                    final CudaResource<cudnnFilterDescriptor> filterDescriptor = gpu.newFilterDescriptor(precision, cudnnTensorFormat.CUDNN_TENSOR_NCHW, outputSize[2], inputSize[2], kernelSize[1], kernelSize[0]);
                    final CudaResource<cudnnConvolutionDescriptor> convolutionDescriptor = gpu.newConvolutions2dDescriptor(cudnnConvolutionMode.CUDNN_CONVOLUTION, precision, paddingY, paddingX, strideY, strideX, 1, 1);
                    @Nullable final CudaTensor deltaTensor = gpu.getTensor(delta, precision, MemoryType.Device, false);
                    delta.freeRef();
                    final int backwardDataAlgorithm = getBackwardDataAlgorithm(gpu, inputDescriptor, filterDescriptor, convolutionDescriptor, deltaTensor);
                    final CudaMemory backwardsDataWorkSpace = gpu.allocateBackwardDataWorkspace(inputDescriptor.getPtr(), filterDescriptor.getPtr(), convolutionDescriptor.getPtr(), deltaTensor.descriptor.getPtr(), backwardDataAlgorithm, 1);
                    @Nonnull final CudaMemory filterPtr = getCudaFilter(gpu);
                    try {
                        @Nonnull final CudaMemory passbackMemory = gpu.allocate((long) Tensor.length(inputData.getDimensions()) * length * precision.size, MemoryType.Managed.normalize(), true);
                        CudaMemory deltaTensorMemory = deltaTensor.getMemory(gpu);
                        // deltaTensorMemory.synchronize();
                        CudaSystem.handle(gpu.cudnnConvolutionBackwardData(precision.getPointer(1.0), filterDescriptor.getPtr(), filterPtr.getPtr(), deltaTensor.descriptor.getPtr(), deltaTensorMemory.getPtr(), convolutionDescriptor.getPtr(), backwardDataAlgorithm, backwardsDataWorkSpace.getPtr(), backwardsDataWorkSpace.size, precision.getPointer(0.0), inputDescriptor.getPtr(), passbackMemory.getPtr()));
                        passbackMemory.dirty();
                        backwardsDataWorkSpace.dirty();
                        deltaTensorMemory.dirty();
                        // deltaTensorMemory.synchronize();
                        filterPtr.dirty();
                        deltaTensorMemory.freeRef();
                        // Extra reference balances the freeRef in the finally block.
                        inputDescriptor.addRef();
                        return CudaTensorList.wrap(CudaTensor.wrap(passbackMemory, inputDescriptor, precision), length, inputSize, precision);
                    } catch (@Nonnull final Throwable e) {
                        throw new ComponentException(String.format("Error in convolution %s x %s => %s", Arrays.toString(inputSize), Arrays.toString(kernelSize), Arrays.toString(outputSize)), e);
                    } finally {
                        filterPtr.freeRef();
                        deltaTensor.freeRef();
                        Stream.of(inputDescriptor, filterDescriptor, convolutionDescriptor, backwardsDataWorkSpace).forEach(ReferenceCounting::freeRef);
                    }
                }, delta);
                if (null != inputBufferTensors) {
                    input.accumulate(buffer, inputBufferTensors);
                }
            } else {
                delta.freeRef();
            }
        };
        Stream.of(learnFn, backpropFn).forEach(Runnable::run);
    }) {

        @Override
        public final void accumulate(DeltaSet<Layer> buffer, TensorList delta) {
            getAccumulator().accept(buffer, delta);
        }

        @Override
        protected void _free() {
            kernel.freeRef();
            inputData.freeRef();
            Arrays.stream(inObj).forEach(ReferenceCounting::freeRef);
            SimpleConvolutionLayer.this.freeRef();
        }

        @Override
        public boolean isAlive() {
            // A gradient is needed either upstream or for this layer's own kernel.
            return input.isAlive() || !isFrozen();
        }
    };
}
Also used : IntStream(java.util.stream.IntStream) JsonObject(com.google.gson.JsonObject) Coordinate(com.simiacryptus.mindseye.lang.Coordinate) Arrays(java.util.Arrays) CudaMemory(com.simiacryptus.mindseye.lang.cudnn.CudaMemory) LoggerFactory(org.slf4j.LoggerFactory) Tensor(com.simiacryptus.mindseye.lang.Tensor) jcuda.jcudnn.cudnnConvolutionMode(jcuda.jcudnn.cudnnConvolutionMode) Result(com.simiacryptus.mindseye.lang.Result) DataSerializer(com.simiacryptus.mindseye.lang.DataSerializer) JsonElement(com.google.gson.JsonElement) Precision(com.simiacryptus.mindseye.lang.cudnn.Precision) CudnnHandle(com.simiacryptus.mindseye.lang.cudnn.CudnnHandle) Map(java.util.Map) jcuda.jcudnn.cudnnConvolutionDescriptor(jcuda.jcudnn.cudnnConvolutionDescriptor) Layer(com.simiacryptus.mindseye.lang.Layer) ReferenceCounting(com.simiacryptus.mindseye.lang.ReferenceCounting) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) CudaResource(com.simiacryptus.mindseye.lang.cudnn.CudaResource) Util(com.simiacryptus.util.Util) jcuda.jcudnn.cudnnConvolutionBwdDataAlgo(jcuda.jcudnn.cudnnConvolutionBwdDataAlgo) CudaSettings(com.simiacryptus.mindseye.lang.cudnn.CudaSettings) ComponentException(com.simiacryptus.mindseye.lang.ComponentException) Logger(org.slf4j.Logger) CudaDevice(com.simiacryptus.mindseye.lang.cudnn.CudaDevice) CudaTensor(com.simiacryptus.mindseye.lang.cudnn.CudaTensor) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) jcuda.jcudnn.cudnnFilterDescriptor(jcuda.jcudnn.cudnnFilterDescriptor) CudaTensorList(com.simiacryptus.mindseye.lang.cudnn.CudaTensorList) Collectors(java.util.stream.Collectors) List(java.util.List) LayerBase(com.simiacryptus.mindseye.lang.LayerBase) Stream(java.util.stream.Stream) CudaSystem(com.simiacryptus.mindseye.lang.cudnn.CudaSystem) ToDoubleFunction(java.util.function.ToDoubleFunction) TensorList(com.simiacryptus.mindseye.lang.TensorList) DoubleSupplier(java.util.function.DoubleSupplier) 
MemoryType(com.simiacryptus.mindseye.lang.cudnn.MemoryType) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) jcuda.jcudnn.cudnnTensorFormat(jcuda.jcudnn.cudnnTensorFormat) CudaTensor(com.simiacryptus.mindseye.lang.cudnn.CudaTensor) Tensor(com.simiacryptus.mindseye.lang.Tensor) CudaTensor(com.simiacryptus.mindseye.lang.cudnn.CudaTensor) Nonnull(javax.annotation.Nonnull) CudaMemory(com.simiacryptus.mindseye.lang.cudnn.CudaMemory) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) CudaTensorList(com.simiacryptus.mindseye.lang.cudnn.CudaTensorList) TensorList(com.simiacryptus.mindseye.lang.TensorList) Result(com.simiacryptus.mindseye.lang.Result) ReferenceCounting(com.simiacryptus.mindseye.lang.ReferenceCounting) CudaResource(com.simiacryptus.mindseye.lang.cudnn.CudaResource) ComponentException(com.simiacryptus.mindseye.lang.ComponentException) Nullable(javax.annotation.Nullable)

Example 58 with DeltaSet

use of com.simiacryptus.mindseye.lang.DeltaSet in project MindsEye by SimiaCryptus.

the class SimpleConvolutionLayer method getCompatibilityLayer.

/**
 * Builds a fallback layer backed by the aparapi {@code ConvolutionLayer}.
 *
 * <p>This layer's kernel is copied into the delegate with its band index transposed. The
 * returned layer evaluates through that delegate; the result it produces reports itself as not
 * alive and its accumulator always throws, so no gradients flow through it. Serialization and
 * state access on the returned layer are unsupported and throw {@link IllegalStateException}.
 *
 * <p>NOTE(review): the transposition uses the integer square root of the kernel's third
 * dimension, which presumably assumes equal input and output band counts; confirm.
 *
 * @return the compatibility layer
 */
@Nonnull
public Layer getCompatibilityLayer() {
    log.info(String.format("Using compatibility layer for %s", this));
    final int[] kernelDims = this.kernel.getDimensions();
    final int bandsPerSide = (int) Math.sqrt(kernelDims[2]);
    @Nonnull final com.simiacryptus.mindseye.layers.aparapi.ConvolutionLayer delegate = new com.simiacryptus.mindseye.layers.aparapi.ConvolutionLayer(kernelDims[0], kernelDims[1], kernelDims[2], true);
    @Nonnull final Tensor transposed = new Tensor(kernel.getDimensions());
    transposed.setByCoord(coordinate -> {
        final int[] coords = coordinate.getCoords();
        final int band = coords[2];
        // Split the flat band index into a (column, row) pair, then swap the two.
        final int column = band % bandsPerSide;
        final int row = (band - column) / bandsPerSide;
        assert band == column + row * bandsPerSide;
        final int swappedBand = row + column * bandsPerSide;
        return kernel.get(coords[0], coords[1], swappedBand);
    });
    delegate.kernel.set(transposed);
    return new LayerBase() {

        @Nonnull
        @Override
        public Result eval(@Nonnull Result... array) {
            // Hold each input for as long as the wrapping result lives.
            for (final Result in : array) {
                in.addRef();
            }
            @Nonnull final Result inner = delegate.eval(array);
            return new Result(inner.getData(), (DeltaSet<Layer> buffer, TensorList data) -> {
                // Back-propagation is not supported through the compatibility layer.
                throw new IllegalStateException();
            }) {

                @Override
                protected void _free() {
                    for (final Result in : array) {
                        in.freeRef();
                    }
                }

                @Override
                public boolean isAlive() {
                    return false;
                }
            };
        }

        @Nonnull
        @Override
        public JsonObject getJson(Map<CharSequence, byte[]> resources, DataSerializer dataSerializer) {
            throw new IllegalStateException();
        }

        @Nonnull
        @Override
        public List<double[]> state() {
            throw new IllegalStateException();
        }
    };
}
Also used : Tensor(com.simiacryptus.mindseye.lang.Tensor) CudaTensor(com.simiacryptus.mindseye.lang.cudnn.CudaTensor) Nonnull(javax.annotation.Nonnull) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) CudaTensorList(com.simiacryptus.mindseye.lang.cudnn.CudaTensorList) TensorList(com.simiacryptus.mindseye.lang.TensorList) Result(com.simiacryptus.mindseye.lang.Result) LayerBase(com.simiacryptus.mindseye.lang.LayerBase) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) DataSerializer(com.simiacryptus.mindseye.lang.DataSerializer) Nonnull(javax.annotation.Nonnull)

Example 59 with DeltaSet

use of com.simiacryptus.mindseye.lang.DeltaSet in project MindsEye by SimiaCryptus.

the class SoftmaxActivationLayer method evalAndFree.

/**
 * Applies the softmax activation to the first input through cuDNN and returns a result that can
 * propagate gradients back to that input.
 * <p>
 * When CUDA is not enabled the call is delegated to {@code getCompatibilityLayer()}. Otherwise the
 * forward pass calls {@code cudnnSoftmaxForward}: the input tensor doubles as the output buffer when
 * both the input data and its GPU tensor report a reference count of 1, and a fresh buffer is
 * allocated in every other case. The accumulator of the returned result calls
 * {@code cudnnSoftmaxBackward} and hands the resulting gradient to the input result while that input
 * is alive; if it is not alive the incoming delta is simply released.
 *
 * @param inObj the inputs; only {@code inObj[0]} is read. Its dimensions are indexed at 0..2, so a
 *              3-dimensional input is presumably required (TODO confirm against callers)
 * @return the softmax result
 * @throws ComponentException wrapping any {@link Throwable} raised inside the CUDA branch
 */
@Nullable
@Override
public Result evalAndFree(@Nonnull final Result... inObj) {
    // Without CUDA, defer entirely to the compatibility implementation.
    if (!CudaSystem.isEnabled())
        return getCompatibilityLayer().evalAndFree(inObj);
    final Result inputResult = inObj[0];
    final TensorList inputData = inputResult.getData();
    @Nonnull final int[] inputSize = inputData.getDimensions();
    // The output has exactly the input's dimensions.
    @Nonnull final int[] outputSize = inputSize;
    final int length = inputData.length();
    final int inputDims = Tensor.length(inputSize);
    try {
        final CudaTensor outPtr = CudaSystem.run(gpu -> {
            @Nullable CudaTensor inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, false);
            final CudaTensor outputTensor;
            if (1 == inputData.currentRefCount() && 1 == inputTensor.currentRefCount()) {
                // Both reference counts are 1: write the result over the input instead of allocating.
                outputTensor = inputTensor;
                outputTensor.addRef();
            } else {
                @Nonnull final CudaDevice.CudaTensorDescriptor outputDescriptor = gpu.newTensorDescriptor(precision, length, inputSize[2], inputSize[1], inputSize[0], inputSize[2] * inputSize[1] * inputSize[0], inputSize[1] * inputSize[0], inputSize[0], 1);
                @Nonnull final CudaMemory outputData = gpu.allocate(precision.size * 1L * inputDims * length, MemoryType.Managed.normalize(), true);
                outputTensor = CudaTensor.wrap(outputData, outputDescriptor, precision);
            }
            try {
                CudaMemory inputMemory = inputTensor.getMemory(gpu);
                CudaMemory outputMemory = outputTensor.getMemory(gpu);
                CudaSystem.handle(gpu.cudnnSoftmaxForward(algorithm.code, mode.code, precision.getPointer(1.0), inputTensor.descriptor.getPtr(), inputMemory.getPtr(), precision.getPointer(0.0), outputTensor.descriptor.getPtr(), outputMemory.getPtr()));
                assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
                inputMemory.dirty();
                outputMemory.dirty();
                outputMemory.freeRef();
                inputMemory.freeRef();
                return outputTensor;
            } catch (@Nonnull final Throwable e) {
                throw new ComponentException("Error apply " + Arrays.toString(inputSize), e);
            } finally {
                inputTensor.freeRef();
            }
        }, inputData);
        return new Result(CudaTensorList.create(outPtr, length, outputSize, precision), (@Nonnull final DeltaSet<Layer> buffer, @Nonnull final TensorList delta) -> {
            if (inputResult.isAlive()) {
                final TensorList data = CudaSystem.run(gpu -> {
                    @Nullable CudaTensor inputTensor;
                    synchronized (gpu) {
                        inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, true);
                    }
                    @Nullable CudaTensor deltaTensor;
                    synchronized (gpu) {
                        deltaTensor = gpu.getTensor(delta, precision, MemoryType.Device, true);
                    }
                    // getDenseAndFree releases one reference, so take one first to keep outPtr usable.
                    outPtr.addRef();
                    CudaTensor localOut = outPtr.getDenseAndFree(gpu);
                    // Read delta's length BEFORE releasing it, so the list is never touched after
                    // freeRef().
                    final int deltaLength = delta.length();
                    delta.freeRef();
                    CudaTensor passbackTensor;
                    passbackTensor = CudaTensor.wrap(gpu.allocate((long) Tensor.length(inputSize) * length * precision.size, MemoryType.Managed.normalize(), false), gpu.newTensorDescriptor(precision, deltaLength, inputSize[2], inputSize[1], inputSize[0], inputSize[2] * inputSize[1] * inputSize[0], inputSize[1] * inputSize[0], inputSize[0], 1), precision);
                    try {
                        CudaMemory localOutMemory = localOut.getMemory(gpu);
                        CudaMemory deltaTensorMemory = deltaTensor.getMemory(gpu);
                        // inputMemory is acquired and released below but is not an argument of
                        // cudnnSoftmaxBackward.
                        CudaMemory inputMemory = inputTensor.getMemory(gpu);
                        CudaMemory passbackMemory = passbackTensor.getMemory(gpu);
                        CudaSystem.handle(gpu.cudnnSoftmaxBackward(algorithm.code, mode.code, precision.getPointer(1.0), localOut.descriptor.getPtr(), localOutMemory.getPtr(), deltaTensor.descriptor.getPtr(), deltaTensorMemory.getPtr(), precision.getPointer(0.0), passbackTensor.descriptor.getPtr(), passbackMemory.getPtr()));
                        localOutMemory.dirty();
                        deltaTensorMemory.dirty();
                        passbackMemory.dirty();
                        localOutMemory.freeRef();
                        deltaTensorMemory.freeRef();
                        inputMemory.freeRef();
                        passbackMemory.freeRef();
                    } catch (@Nonnull final Throwable e) {
                        throw new ComponentException("Error apply " + Arrays.toString(inputSize), e);
                    } finally {
                        localOut.freeRef();
                        inputTensor.freeRef();
                        deltaTensor.freeRef();
                    }
                    return CudaTensorList.wrap(passbackTensor, length, inputSize, precision);
                }, delta);
                inputResult.accumulate(buffer, data);
            } else {
                // Nothing to propagate to; just drop the incoming gradient.
                delta.freeRef();
            }
        }) {

            @Override
            public final void accumulate(DeltaSet<Layer> buffer, TensorList delta) {
                getAccumulator().accept(buffer, delta);
            }

            @Override
            protected void _free() {
                inputData.freeRef();
                outPtr.freeRef();
                inputResult.freeRef();
            }

            @Override
            public boolean isAlive() {
                return inputResult.isAlive() || !isFrozen();
            }
        };
    } catch (@Nonnull final Throwable e) {
        throw new ComponentException("Error apply image res " + Arrays.toString(inputSize), e);
    }
}
Also used : CudaTensor(com.simiacryptus.mindseye.lang.cudnn.CudaTensor) CudaDevice(com.simiacryptus.mindseye.lang.cudnn.CudaDevice) Nonnull(javax.annotation.Nonnull) CudaMemory(com.simiacryptus.mindseye.lang.cudnn.CudaMemory) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) CudaTensorList(com.simiacryptus.mindseye.lang.cudnn.CudaTensorList) TensorList(com.simiacryptus.mindseye.lang.TensorList) Result(com.simiacryptus.mindseye.lang.Result) ComponentException(com.simiacryptus.mindseye.lang.ComponentException) Nullable(javax.annotation.Nullable) Nullable(javax.annotation.Nullable)

Example 60 with DeltaSet

use of com.simiacryptus.mindseye.lang.DeltaSet in project MindsEye by SimiaCryptus.

the class SumInputsLayer method evalAndFree.

/**
 * Combines the data of all inputs into a single result via {@code gpu.addAndFree} (presumably an
 * element-wise sum; confirm against the CUDA helper).
 * <p>
 * The first input must have exactly three dimensions, and every further input must have the same
 * {@code Tensor.length} of its dimensions as the first. When CUDA is not enabled the call is
 * delegated to {@code getCompatibilityLayer()}. The reduction and the gradient fan-out both run on a
 * parallel stream when {@code parallel} is set and the core settings are not single-threaded.
 *
 * @param inObj the inputs to combine; {@code inObj[0]} is always read
 * @return a result holding the combined data; its accumulator passes the incoming delta to every
 *         input that is alive
 * @throws IllegalArgumentException if the first input is not 3-dimensional or another input's
 *                                  dimensions have a different {@code Tensor.length}
 */
@Nullable
@Override
public Result evalAndFree(@Nonnull final Result... inObj) {
    @Nonnull final int[] dimensions = inObj[0].getData().getDimensions();
    if (dimensions.length != 3) {
        throw new IllegalArgumentException("dimensions=" + Arrays.toString(dimensions));
    }
    for (int i = 1; i < inObj.length; i++) {
        final int[] otherDimensions = inObj[i].getData().getDimensions();
        if (Tensor.length(dimensions) != Tensor.length(otherDimensions)) {
            throw new IllegalArgumentException(Arrays.toString(dimensions) + " != " + Arrays.toString(otherDimensions));
        }
    }
    if (!CudaSystem.isEnabled()) {
        return getCompatibilityLayer().evalAndFree(inObj);
    }
    final Stream<TensorList> inputData = Arrays.stream(inObj).map(input -> input.getData());
    final Stream<TensorList> summands = (!CoreSettings.INSTANCE.isSingleThreaded() && parallel)
        ? inputData.parallel()
        : inputData;
    // Pairwise reduction; the block-bodied lambda keeps the value-returning CudaSystem.run overload.
    final TensorList sum = summands.reduce((accumulated, next) -> CudaSystem.run(gpu -> {
        return gpu.addAndFree(precision, accumulated, next);
    }, accumulated, next)).get();
    return new Result(sum, (@Nonnull final DeltaSet<Layer> buffer, @Nonnull final TensorList delta) -> {
        final Stream<Result> recipients = Arrays.stream(inObj);
        final Stream<Result> targets = (!CoreSettings.INSTANCE.isSingleThreaded() && parallel)
            ? recipients.parallel()
            : recipients;
        targets.filter(Result::isAlive).forEach(target -> {
            // Each live input gets its own reference to the shared delta.
            delta.addRef();
            target.accumulate(buffer, delta);
        });
    }) {

        @Override
        protected void _free() {
            for (final Result input : inObj) {
                input.freeRef();
            }
        }

        @Override
        public boolean isAlive() {
            // Alive as soon as any single input is alive.
            return Arrays.stream(inObj).anyMatch(Result::isAlive);
        }
    };
}
Also used : JsonObject(com.google.gson.JsonObject) Arrays(java.util.Arrays) Tensor(com.simiacryptus.mindseye.lang.Tensor) CoreSettings(com.simiacryptus.mindseye.lang.CoreSettings) Result(com.simiacryptus.mindseye.lang.Result) DataSerializer(com.simiacryptus.mindseye.lang.DataSerializer) Precision(com.simiacryptus.mindseye.lang.cudnn.Precision) List(java.util.List) LayerBase(com.simiacryptus.mindseye.lang.LayerBase) Stream(java.util.stream.Stream) CudaSystem(com.simiacryptus.mindseye.lang.cudnn.CudaSystem) TensorList(com.simiacryptus.mindseye.lang.TensorList) Map(java.util.Map) Layer(com.simiacryptus.mindseye.lang.Layer) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) Nonnull(javax.annotation.Nonnull) DeltaSet(com.simiacryptus.mindseye.lang.DeltaSet) TensorList(com.simiacryptus.mindseye.lang.TensorList) Result(com.simiacryptus.mindseye.lang.Result) Nullable(javax.annotation.Nullable)

Aggregations

DeltaSet (com.simiacryptus.mindseye.lang.DeltaSet)98 Nonnull (javax.annotation.Nonnull)98 Result (com.simiacryptus.mindseye.lang.Result)90 Nullable (javax.annotation.Nullable)88 Layer (com.simiacryptus.mindseye.lang.Layer)86 TensorList (com.simiacryptus.mindseye.lang.TensorList)86 Arrays (java.util.Arrays)77 List (java.util.List)75 Tensor (com.simiacryptus.mindseye.lang.Tensor)73 Map (java.util.Map)66 IntStream (java.util.stream.IntStream)65 TensorArray (com.simiacryptus.mindseye.lang.TensorArray)64 JsonObject (com.google.gson.JsonObject)62 DataSerializer (com.simiacryptus.mindseye.lang.DataSerializer)61 LayerBase (com.simiacryptus.mindseye.lang.LayerBase)60 Logger (org.slf4j.Logger)47 LoggerFactory (org.slf4j.LoggerFactory)47 ReferenceCounting (com.simiacryptus.mindseye.lang.ReferenceCounting)23 CudaTensor (com.simiacryptus.mindseye.lang.cudnn.CudaTensor)22 CudaTensorList (com.simiacryptus.mindseye.lang.cudnn.CudaTensorList)22