Use of com.simiacryptus.mindseye.lang.ComponentException in the MindsEye project by SimiaCryptus: class ActivationLayer, method evalAndFree.
// Forward pass of the activation function via cudnnActivationForward; the returned
// Result carries a backward closure that computes gradients via cudnnActivationBackward.
// NOTE(review): "evalAndFree" convention — this method takes ownership of inObj;
// the captured references are released in the returned Result's _free().
@Nullable
@Override
public Result evalAndFree(@Nonnull final Result... inObj) {
// Fall back to the pure-Java implementation when CUDA is unavailable.
if (!CudaSystem.isEnabled())
return getCompatibilityLayer().evalAndFree(inObj);
// assert Arrays.stream(inObj).flatMapToDouble(input->input.data.stream().flatMapToDouble(x-> Arrays.stream(x.getData()))).allMatch(v->Double.isFinite(v));
// Only the first input is used. inputSize indexing below ([2],[1],[0]) suggests
// dimensions are stored [width, height, channels] — TODO confirm axis meaning.
final Result inputResult = inObj[0];
final TensorList inputData = inputResult.getData();
@Nonnull final int[] inputSize = inputData.getDimensions();
// Activation is elementwise, so the output dimensions equal the input dimensions.
@Nonnull final int[] outputSize = inputSize;
final int length = inputData.length();
final int inputDims = Tensor.length(inputSize);
try {
final CudaTensor outPtr = CudaSystem.run(gpu -> {
@Nullable final CudaTensor inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, false);
final CudaTensor outputTensor;
// In-place optimization: reuse the input tensor as the output when we are the sole
// holder of both the TensorList and the tensor, and either no backward pass will
// run or the mode is ReLU (in-place is only attempted for that mode).
if (1 == inputData.currentRefCount() && 1 == inputTensor.currentRefCount() && (!inputResult.isAlive() || mode == Mode.RELU.id)) {
inputTensor.addRef();
outputTensor = inputTensor;
} else {
// Densely-packed descriptor: strides c*h*w, h*w, w, 1 over [length, c, h, w].
@Nonnull final CudaDevice.CudaTensorDescriptor outputDescriptor = gpu.newTensorDescriptor(precision, length, inputSize[2], inputSize[1], inputSize[0], inputSize[2] * inputSize[1] * inputSize[0], inputSize[1] * inputSize[0], inputSize[0], 1);
@Nonnull final CudaMemory outputData = gpu.allocate((long) precision.size * inputDims * length, MemoryType.Managed.normalize(), true);
outputTensor = CudaTensor.wrap(outputData, outputDescriptor, precision);
}
@Nonnull final CudaResource<cudnnActivationDescriptor> activationDesc = gpu.newActivationDescriptor(mode, cudnnNanPropagation.CUDNN_NOT_PROPAGATE_NAN, 0);
try {
CudaMemory memory = inputTensor.getMemory(gpu);
CudaMemory tensorMemory = outputTensor.getMemory(gpu);
// out = 1.0 * act(in) + 0.0 * out
CudaSystem.handle(gpu.cudnnActivationForward(activationDesc.getPtr(), precision.getPointer(1.0), inputTensor.descriptor.getPtr(), memory.getPtr(), precision.getPointer(0.0), outputTensor.descriptor.getPtr(), tensorMemory.getPtr()));
assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
// Mark device buffers modified; the input is marked too because it may be
// aliased with the output in the in-place branch above.
memory.dirty();
tensorMemory.dirty();
tensorMemory.freeRef();
memory.freeRef();
return outputTensor;
} catch (@Nonnull final Throwable e) {
throw new ComponentException("Error apply " + Arrays.toString(inputSize), e);
} finally {
activationDesc.freeRef();
inputTensor.freeRef();
}
}, inputData);
// Backward closure: given delta w.r.t. the output, compute delta w.r.t. the input
// and accumulate it into the upstream result.
return new Result(CudaTensorList.create(outPtr, length, outputSize, precision), (@Nonnull final DeltaSet<Layer> buffer, @Nonnull final TensorList delta) -> {
if (inputResult.isAlive()) {
final TensorList data = CudaSystem.run(gpu -> {
@Nullable CudaTensor inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, true);
@Nullable CudaTensor deltaTensor = gpu.getTensor(delta, precision, MemoryType.Device, true);
assert length == delta.length();
// cudnnActivationBackward also needs the forward output, densely packed.
CudaTensor localOut = outPtr.getDense(gpu);
delta.freeRef();
CudaTensor passbackTensor;
// if (sameStrides(deltaTensor.descriptor, inputTensor.descriptor)) {
// passbackTensor = deltaTensor;
// passbackTensor.addRef();
// }
// else {
// passbackTensor = deltaTensor.getDense(gpu);
// inputTensor = inputTensor.getDenseAndFree(gpu);
// }
// Allocate a fresh densely-packed passback buffer shaped like the input.
passbackTensor = CudaTensor.wrap(gpu.allocate((long) Tensor.length(inputSize) * length * precision.size, MemoryType.Managed.normalize(), false), gpu.newTensorDescriptor(precision, length, inputSize[2], inputSize[1], inputSize[0], inputSize[2] * inputSize[1] * inputSize[0], inputSize[1] * inputSize[0], inputSize[0], 1), precision);
@Nonnull final CudaResource<cudnnActivationDescriptor> activationDesc = gpu.newActivationDescriptor(mode, cudnnNanPropagation.CUDNN_NOT_PROPAGATE_NAN, 0);
try {
CudaMemory localOutMemory = localOut.getMemory(gpu);
CudaMemory deltaTensorMemory = deltaTensor.getMemory(gpu);
CudaMemory inputTensorMemory = inputTensor.getMemory(gpu);
CudaMemory passbackTensorMemory = passbackTensor.getMemory(gpu);
// passback = 1.0 * dAct(out, dOut, in) + 0.0 * passback
CudaSystem.handle(gpu.cudnnActivationBackward(activationDesc.getPtr(), precision.getPointer(1.0), localOut.descriptor.getPtr(), localOutMemory.getPtr(), deltaTensor.descriptor.getPtr(), deltaTensorMemory.getPtr(), inputTensor.descriptor.getPtr(), inputTensorMemory.getPtr(), precision.getPointer(0.0), passbackTensor.descriptor.getPtr(), passbackTensorMemory.getPtr()));
assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
localOutMemory.dirty();
deltaTensorMemory.dirty();
inputTensorMemory.dirty();
passbackTensorMemory.dirty();
localOutMemory.freeRef();
deltaTensorMemory.freeRef();
inputTensorMemory.freeRef();
passbackTensorMemory.freeRef();
} catch (@Nonnull final Throwable e) {
throw new ComponentException("Error apply " + Arrays.toString(inputSize), e);
} finally {
localOut.freeRef();
inputTensor.freeRef();
deltaTensor.freeRef();
activationDesc.freeRef();
}
return CudaTensorList.wrap(passbackTensor, length, inputSize, precision);
}, delta);
inputResult.accumulate(buffer, data);
} else {
// Upstream is dead: nobody consumes the gradient, just release the delta.
delta.freeRef();
}
}) {
@Override
public final void accumulate(DeltaSet<Layer> buffer, TensorList delta) {
// Delegate to the accumulator (the backward closure above).
getAccumulator().accept(buffer, delta);
}
@Override
protected void _free() {
// Release the references captured by this Result.
inputData.freeRef();
outPtr.freeRef();
inputResult.freeRef();
}
@Override
public boolean isAlive() {
// Gradients are needed if upstream is alive or this layer has trainable state.
return inputResult.isAlive() || !isFrozen();
}
};
} catch (@Nonnull final Throwable e) {
throw new ComponentException("Error apply image res " + Arrays.toString(inputSize), e);
}
}
Use of com.simiacryptus.mindseye.lang.ComponentException in the MindsEye project by SimiaCryptus: class PoolingLayer, method evalAndFree.
// GPU pooling forward pass via cudnnPoolingForward; the returned Result carries a
// backward closure that computes gradients via cudnnPoolingBackward.
// NOTE(review): "evalAndFree" convention — this method takes ownership of inObj;
// the captured references are released in the returned Result's _free().
@Nullable
@Override
public Result evalAndFree(@Nonnull final Result... inObj) {
// Fall back to the pure-Java implementation when CUDA is unavailable.
if (!CudaSystem.isEnabled())
return getCompatibilityLayer().evalAndFree(inObj);
// 2D pooling window / padding / stride configuration.
final int poolDims = 2;
@Nonnull final int[] windowSize = { windowX, windowY };
@Nonnull final int[] padding = { paddingX, paddingY };
@Nonnull final int[] stride = { strideX, strideY };
final Result input = inObj[0];
final TensorList inputData = input.getData();
@Nonnull final int[] inputSize = inputData.getDimensions();
final int length = inputData.length();
final int inputDims = Tensor.length(inputSize);
// Filled in by cudnnGetPoolingNdForwardOutputDim below as [N, C, H, W].
@Nonnull final int[] outputSize = new int[4];
final CudaTensor outputData = CudaSystem.run(gpu -> {
try {
gpu.initThread();
@Nonnull final CudaResource<cudnnPoolingDescriptor> poolingDesc = gpu.createPoolingDescriptor(mode.id, poolDims, windowSize, padding, stride);
@Nullable final CudaTensor inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, false);
// Ask cuDNN for the pooled output dimensions.
CudaSystem.handle(CudaSystem.cudnnGetPoolingNdForwardOutputDim(poolingDesc.getPtr(), inputTensor.descriptor.getPtr(), 4, outputSize));
// Pooling must preserve the channel count.
assert inputSize[2] == outputSize[1];
// Densely-packed descriptor: strides C*H*W, H*W, W, 1.
@Nonnull final CudaDevice.CudaTensorDescriptor outputDescriptor = gpu.newTensorDescriptor(precision, outputSize[0], outputSize[1], outputSize[2], outputSize[3], outputSize[1] * outputSize[2] * outputSize[3], outputSize[2] * outputSize[3], outputSize[3], 1);
@Nonnull final CudaMemory outputTensor = gpu.allocate((long) precision.size * Tensor.length(outputSize), MemoryType.Managed.normalize(), true);
CudaMemory inputDataMemory = inputTensor.getMemory(gpu);
// out = alpha * pool(in) + 0.0 * out
CudaSystem.handle(gpu.cudnnPoolingForward(poolingDesc.getPtr(), precision.getPointer(alpha), inputTensor.descriptor.getPtr(), inputDataMemory.getPtr(), precision.getPointer(0.0), outputDescriptor.getPtr(), outputTensor.getPtr()));
assert CudaDevice.isThreadDeviceId(gpu.getDeviceId());
inputDataMemory.dirty();
outputTensor.dirty();
// Release intermediate resources in one pass.
Stream.<ReferenceCounting>of(inputTensor, poolingDesc, inputDataMemory).forEach(ReferenceCounting::freeRef);
return CudaTensor.wrap(outputTensor, outputDescriptor, precision);
} catch (@Nonnull final Throwable e) {
throw new ComponentException("Error", e);
}
}, inputData);
// Result dims are [W, H, C], reordered from the cuDNN [N, C, H, W] output size.
return new Result(CudaTensorList.create(outputData, length, new int[] { outputSize[3], outputSize[2], outputSize[1] }, precision), (@Nonnull final DeltaSet<Layer> buffer, @Nonnull final TensorList error) -> {
assert error.length() == inputData.length();
if (input.isAlive()) {
TensorList data = CudaSystem.run(gpu -> {
// Passback buffer shaped like the (densely packed) input.
@Nonnull final CudaDevice.CudaTensorDescriptor passbackDescriptor = gpu.newTensorDescriptor(precision, length, inputSize[2], inputSize[1], inputSize[0], inputSize[2] * inputSize[1] * inputSize[0], inputSize[1] * inputSize[0], inputSize[0], 1);
@Nonnull final CudaResource<cudnnPoolingDescriptor> poolingDesc = gpu.createPoolingDescriptor(mode.id, poolDims, windowSize, padding, stride);
@Nullable final CudaTensor inputTensor;
// NOTE(review): getTensor is serialized on the gpu handle here but not in the
// forward pass above — presumably guarding a concurrency issue; confirm intent.
synchronized (gpu) {
inputTensor = gpu.getTensor(inputData, precision, MemoryType.Device, true);
}
@Nullable final CudaTensor errorPtr;
synchronized (gpu) {
errorPtr = gpu.getTensor(error, precision, MemoryType.Device, true);
}
@Nonnull final CudaMemory passbackBuffer = gpu.allocate((long) inputDims * precision.size * length, MemoryType.Managed.normalize(), true);
CudaMemory outputDataMemory = outputData.getMemory(gpu);
CudaMemory errorPtrMemory = errorPtr.getMemory(gpu);
CudaMemory inputDataMemory = inputTensor.getMemory(gpu);
// passback = alpha * dPool(out, dOut, in) + 0.0 * passback
CudaSystem.handle(gpu.cudnnPoolingBackward(poolingDesc.getPtr(), precision.getPointer(this.alpha), outputData.descriptor.getPtr(), outputDataMemory.getPtr(), errorPtr.descriptor.getPtr(), errorPtrMemory.getPtr(), inputTensor.descriptor.getPtr(), inputDataMemory.getPtr(), precision.getPointer(0.0), passbackDescriptor.getPtr(), passbackBuffer.getPtr()));
outputDataMemory.dirty();
errorPtrMemory.dirty();
inputDataMemory.dirty();
passbackBuffer.dirty();
// Release intermediate resources in one pass.
Stream.<ReferenceCounting>of(errorPtr, inputTensor, poolingDesc, outputDataMemory, errorPtrMemory, inputDataMemory).forEach(ReferenceCounting::freeRef);
return CudaTensorList.wrap(CudaTensor.wrap(passbackBuffer, passbackDescriptor, precision), length, inputSize, precision);
}, error);
input.accumulate(buffer, data);
}
}) {
@Override
protected void _free() {
// Release the references captured by this Result.
Arrays.stream(inObj).forEach(nnResult -> nnResult.freeRef());
inputData.freeRef();
outputData.freeRef();
}
@Override
public boolean isAlive() {
// Gradients are needed if upstream is alive or this layer has trainable state.
return input.isAlive() || !isFrozen();
}
};
}
Use of com.simiacryptus.mindseye.lang.ComponentException in the MindsEye project by SimiaCryptus: class ConvolutionController, method gradient.
/**
 * Computes the weight gradient for this convolution on an OpenCL device.
 * Stages the arrays into the shared static {@code kernelTask}, explicitly uploads
 * the buffers, executes the kernel, and reads the computed weight gradient back
 * into {@code weights}.
 *
 * @param input      the forward input values
 * @param weights    receives the computed weight gradient
 * @param weightSize size of one weight partition; {@code weights.length / weightSize} is the parallelism
 * @param output     the output-side delta values
 */
private void gradient(@Nonnull final double[] input, @Nonnull final double[] weights, final int weightSize, @Nonnull final double[] output) {
    assert 0 < input.length;
    assert 0 < weights.length;
    assert 0 < output.length;
    OpenCL.devicePool.apply(device -> {
        try {
            // kernelTask is shared static state; hold its lock for the whole populate/exe/read cycle.
            synchronized (ConvolutionController.kernelTask) {
                ConvolutionController.kernelTask.input = input;
                ConvolutionController.kernelTask.weights = weights;
                ConvolutionController.kernelTask.output = output;
                ConvolutionController.kernelTask.outputSize = outputSize;
                ConvolutionController.kernelTask.inputSize = inputSize;
                ConvolutionController.kernelTask.kernelSize = kernelSize;
                ConvolutionController.kernelTask.weightSize = weightSize;
                ConvolutionController.kernelTask.paralellism = weights.length / weightSize;
                // Default padding centers the kernel; explicit paddingX/paddingY override it.
                ConvolutionController.kernelTask.kernelOffset = new int[] { paddingY == null ? (kernelSize[1] - 1) / 2 : paddingY, paddingX == null ? (kernelSize[0] - 1) / 2 : paddingX };
                ConvolutionController.kernelTask.setExplicit(true);
                // BUG FIX: this previously uploaded convolveTask.kernelOffset — a different
                // task's field — leaving kernelTask's freshly computed offset unsent.
                ConvolutionController.kernelTask.put(ConvolutionController.kernelTask.kernelOffset);
                ConvolutionController.kernelTask.put(ConvolutionController.kernelTask.outputSize);
                ConvolutionController.kernelTask.put(ConvolutionController.kernelTask.inputSize);
                ConvolutionController.kernelTask.put(ConvolutionController.kernelTask.kernelSize);
                ConvolutionController.kernelTask.put(ConvolutionController.kernelTask.input);
                ConvolutionController.kernelTask.put(ConvolutionController.kernelTask.output);
                ConvolutionController.kernelTask.exe(device);
                // Read the computed weight gradient back from the device.
                ConvolutionController.kernelTask.get(ConvolutionController.kernelTask.weights);
                // Clear references so the shared task does not pin these arrays.
                ConvolutionController.kernelTask.input = null;
                ConvolutionController.kernelTask.weights = null;
                ConvolutionController.kernelTask.output = null;
                ConvolutionController.kernelTask.outputSize = null;
                ConvolutionController.kernelTask.inputSize = null;
                ConvolutionController.kernelTask.kernelSize = null;
            }
        } catch (@Nonnull final Throwable e) {
            throw new ComponentException("Error apply " + this, e);
        }
    });
}
Use of com.simiacryptus.mindseye.lang.ComponentException in the MindsEye project by SimiaCryptus: class ConvolutionController, method backprop.
/**
 * Backprop: propagates the output-side deltas back into the input arrays, one
 * batch of examples per kernel run. Results are written into {@code input}.
 *
 * @param input   receives the computed input-side deltas, one array per example
 * @param weights the convolution weights
 * @param output  the output-side deltas, one array per example
 */
public void backprop(@Nonnull final double[][] input, @Nonnull final double[] weights, @Nonnull final double[][] output) {
    final int length = input.length;
    assert length == output.length;
    final int inLength = input[0].length;
    final int outLength = output[0].length;
    // Batch as many examples per kernel run as the buffer cap allows.
    final int inputsPerRun = Math.min(Math.floorDiv(ConvolutionController.MAX_BUFFER_SIZE, inLength), length);
    assert 0 < inputsPerRun : "Requested buffer is over max of " + ConvolutionController.MAX_BUFFER_SIZE;
    final int runs = length / inputsPerRun;
    final int leftover = length - runs * inputsPerRun;
    OpenCL.devicePool.apply(device -> {
        try {
            // backpropTask is shared static state; hold its lock for the whole cycle.
            synchronized (ConvolutionController.backpropTask) {
                assert 0 < weights.length;
                assert kernelSize[0] * kernelSize[1] * kernelSize[2] == weights.length;
                ConvolutionController.backpropTask.setExplicit(true);
                ConvolutionController.backpropTask.weights = weights;
                ConvolutionController.backpropTask.put(ConvolutionController.backpropTask.weights);
                ConvolutionController.backpropTask.kernelSize = kernelSize;
                ConvolutionController.backpropTask.put(ConvolutionController.backpropTask.kernelSize);
                // Default padding centers the kernel; explicit paddingX/paddingY override it.
                ConvolutionController.backpropTask.kernelOffset = new int[] { null == paddingY ? (kernelSize[1] - 1) / 2 : paddingY, null == paddingX ? (kernelSize[0] - 1) / 2 : paddingX };
                // BUG FIX: this previously uploaded convolveTask.kernelOffset — a different
                // task's field — leaving backpropTask's freshly computed offset unsent.
                ConvolutionController.backpropTask.put(ConvolutionController.backpropTask.kernelOffset);
                @Nullable double[] inputBuffer = null;
                @Nullable double[] outputBuffer = null;
                // BUG FIX: the original loop ran `for (run = 0; run < runs; ...)` with
                // `run < run - 1 ? inputsPerRun : ...` (always false), so when leftover != 0
                // every run processed only `leftover` items at inputsPerRun-strided offsets
                // and the final partial batch was never executed at all. Mirror convolve():
                // `runs` full batches plus one leftover batch, skipping an empty tail.
                for (int run = 0; run <= runs; run++) {
                    final int currentIndexOffset = run * inputsPerRun;
                    final int currentNumItems = run < runs ? inputsPerRun : leftover;
                    if (0 == currentNumItems) {
                        continue;
                    }
                    // (Re)allocate staging buffers when the batch size changes.
                    if (null == inputBuffer || inputBuffer.length != inLength * currentNumItems) {
                        if (null != inputBuffer)
                            RecycleBin.DOUBLES.recycle(inputBuffer, inputBuffer.length);
                        inputBuffer = RecycleBin.DOUBLES.obtain(inLength * currentNumItems);
                    }
                    if (null == outputBuffer || outputBuffer.length != outLength * currentNumItems) {
                        if (null != outputBuffer)
                            RecycleBin.DOUBLES.recycle(outputBuffer, outputBuffer.length);
                        outputBuffer = RecycleBin.DOUBLES.obtain(outLength * currentNumItems);
                    }
                    // Pack this batch's output-side deltas into the staging buffer.
                    for (int i = 0; i < currentNumItems; i++) {
                        assert outLength == output[currentIndexOffset + i].length;
                        System.arraycopy(output[currentIndexOffset + i], 0, outputBuffer, i * outLength, outLength);
                    }
                    assert 0 < inputBuffer.length;
                    assert 0 < outputBuffer.length;
                    ConvolutionController.backpropTask.input = inputBuffer;
                    ConvolutionController.backpropTask.output = outputBuffer;
                    ConvolutionController.backpropTask.outputSize = outputSize;
                    ConvolutionController.backpropTask.inputSize = inputSize;
                    ConvolutionController.backpropTask.put(ConvolutionController.backpropTask.outputSize);
                    ConvolutionController.backpropTask.put(ConvolutionController.backpropTask.inputSize);
                    ConvolutionController.backpropTask.put(ConvolutionController.backpropTask.output);
                    ConvolutionController.backpropTask.exe(device);
                    // Read the computed input-side deltas back from the device.
                    ConvolutionController.backpropTask.get(ConvolutionController.backpropTask.input);
                    ConvolutionController.backpropTask.input = null;
                    ConvolutionController.backpropTask.output = null;
                    ConvolutionController.backpropTask.outputSize = null;
                    ConvolutionController.backpropTask.inputSize = null;
                    // Unpack the staged results into the per-example input arrays.
                    for (int i = 0; i < currentNumItems; i++) {
                        assert inLength == input[currentIndexOffset + i].length;
                        System.arraycopy(inputBuffer, i * inLength, input[currentIndexOffset + i], 0, inLength);
                    }
                }
                // Robustness: guard recycling against null buffers (possible only if no batch ran).
                if (null != inputBuffer)
                    RecycleBin.DOUBLES.recycle(inputBuffer, inputBuffer.length);
                if (null != outputBuffer)
                    RecycleBin.DOUBLES.recycle(outputBuffer, outputBuffer.length);
                ConvolutionController.backpropTask.kernelSize = null;
                ConvolutionController.backpropTask.weights = null;
            }
        } catch (@Nonnull final Throwable e) {
            throw new ComponentException("Error apply " + this, e);
        }
    });
}
Use of com.simiacryptus.mindseye.lang.ComponentException in the MindsEye project by SimiaCryptus: class ConvolutionController, method convolve.
/**
 * Convolve: runs the forward convolution for a set of examples, batching them
 * through the shared static {@code convolveTask} one kernel run at a time.
 * Results are written into {@code output}.
 *
 * @param input   the per-example input values
 * @param weights the convolution weights
 * @param output  receives the per-example convolution results
 */
public void convolve(@Nonnull final double[][] input, @Nonnull final double[] weights, @Nonnull final double[][] output) {
    final int batchCount = input.length;
    assert batchCount == output.length;
    final int itemInSize = input[0].length;
    final int itemOutSize = output[0].length;
    // Fit as many examples per kernel run as the buffer cap allows.
    final int itemsPerPass = Math.min(Math.floorDiv(ConvolutionController.MAX_BUFFER_SIZE, itemInSize), batchCount);
    assert 0 < itemsPerPass : "Requested buffer is over max of " + ConvolutionController.MAX_BUFFER_SIZE;
    final int fullPasses = batchCount / itemsPerPass;
    final int remainder = batchCount - fullPasses * itemsPerPass;
    OpenCL.devicePool.apply(device -> {
        try {
            // convolveTask is shared static state; hold its lock for the whole cycle.
            synchronized (ConvolutionController.convolveTask) {
                assert null != weights;
                assert 0 < weights.length;
                ConvolutionController.convolveTask.setExplicit(true);
                ConvolutionController.convolveTask.weights = weights;
                ConvolutionController.convolveTask.put(ConvolutionController.convolveTask.weights);
                ConvolutionController.convolveTask.kernelSize = kernelSize;
                // Default padding centers the kernel; explicit paddingX/paddingY override it.
                ConvolutionController.convolveTask.kernelOffset = new int[] { null == paddingY ? (kernelSize[1] - 1) / 2 : paddingY, null == paddingX ? (kernelSize[0] - 1) / 2 : paddingX };
                ConvolutionController.convolveTask.put(ConvolutionController.convolveTask.kernelOffset);
                ConvolutionController.convolveTask.put(ConvolutionController.convolveTask.kernelSize);
                @Nullable double[] stagedIn = null;
                @Nullable double[] stagedOut = null;
                // fullPasses complete batches, then one trailing batch for the remainder.
                for (int pass = 0; pass <= fullPasses; pass++) {
                    final int baseIndex = pass * itemsPerPass;
                    final int itemsThisPass = pass == fullPasses ? remainder : itemsPerPass;
                    if (itemsThisPass == 0) {
                        continue;
                    }
                    // Drop a wrongly-sized staging buffer, then (re)acquire as needed.
                    if (stagedIn != null && stagedIn.length != itemInSize * itemsThisPass) {
                        RecycleBin.DOUBLES.recycle(stagedIn, stagedIn.length);
                        stagedIn = null;
                    }
                    if (stagedIn == null) {
                        stagedIn = RecycleBin.DOUBLES.obtain(itemInSize * itemsThisPass);
                    }
                    if (stagedOut != null && stagedOut.length != itemOutSize * itemsThisPass) {
                        RecycleBin.DOUBLES.recycle(stagedOut, stagedOut.length);
                        stagedOut = null;
                    }
                    if (stagedOut == null) {
                        stagedOut = RecycleBin.DOUBLES.obtain(itemOutSize * itemsThisPass);
                    }
                    // Pack this batch's inputs into the staging buffer.
                    for (int i = 0; i < itemsThisPass; i++) {
                        assert itemInSize == input[baseIndex + i].length;
                        System.arraycopy(input[baseIndex + i], 0, stagedIn, i * itemInSize, itemInSize);
                    }
                    assert 0 < stagedIn.length;
                    assert 0 < stagedOut.length;
                    ConvolutionController.convolveTask.input = stagedIn;
                    ConvolutionController.convolveTask.output = stagedOut;
                    ConvolutionController.convolveTask.outputSize = outputSize;
                    ConvolutionController.convolveTask.inputSize = inputSize;
                    ConvolutionController.convolveTask.put(ConvolutionController.convolveTask.outputSize);
                    ConvolutionController.convolveTask.put(ConvolutionController.convolveTask.inputSize);
                    ConvolutionController.convolveTask.put(ConvolutionController.convolveTask.input);
                    ConvolutionController.convolveTask.exe(device);
                    // Read the convolution results back from the device.
                    ConvolutionController.convolveTask.get(ConvolutionController.convolveTask.output);
                    ConvolutionController.convolveTask.input = null;
                    ConvolutionController.convolveTask.output = null;
                    ConvolutionController.convolveTask.outputSize = null;
                    ConvolutionController.convolveTask.inputSize = null;
                    // Unpack the staged results into the per-example output arrays.
                    for (int i = 0; i < itemsThisPass; i++) {
                        assert itemOutSize == output[baseIndex + i].length;
                        System.arraycopy(stagedOut, i * itemOutSize, output[baseIndex + i], 0, itemOutSize);
                    }
                }
                RecycleBin.DOUBLES.recycle(stagedIn, stagedIn.length);
                RecycleBin.DOUBLES.recycle(stagedOut, stagedOut.length);
                ConvolutionController.convolveTask.kernelSize = null;
                ConvolutionController.convolveTask.weights = null;
            }
        } catch (@Nonnull final Throwable e) {
            throw new ComponentException("Error apply " + this, e);
        }
    });
}
Aggregations