Use of org.deeplearning4j.exception.DL4JInvalidInputException in the deeplearning4j project (by deeplearning4j).
Example: class WordVectorSerializer, method readWord2VecModel.
/**
 * This method restores a Word2Vec model from a file, probing formats in this order:
 * 1) DL4j compressed (zip) format
 * 2) Popular CSV word2vec text format
 * 3) Binary model, either compressed or not. Like well-known Google Model
 *
 * Please note: if extended data isn't available, only weights will be loaded instead.
 *
 * @param file model file to restore from
 * @param extendedModel if TRUE, we'll try to load HS states & Huffman tree info, if FALSE, only weights will be loaded
 * @return restored Word2Vec model
 */
public static Word2Vec readWord2VecModel(@NonNull File file, boolean extendedModel) {
    InMemoryLookupTable<VocabWord> lookupTable = new InMemoryLookupTable<>();
    AbstractCache<VocabWord> vocabCache = new AbstractCache<>();
    Word2Vec vec;
    INDArray syn0 = null;
    VectorsConfiguration configuration = new VectorsConfiguration();
    if (!file.exists() || !file.isFile())
        throw new ND4JIllegalStateException("File [" + file.getAbsolutePath() + "] doesn't exist");
    // Relax GC while streaming potentially huge weight tables; original settings are restored below.
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    if (originalPeriodic)
        Nd4j.getMemoryManager().togglePeriodicGc(false);
    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);
    // try to load zip format
    try {
        if (extendedModel) {
            log.debug("Trying full model restoration...");
            if (originalPeriodic)
                Nd4j.getMemoryManager().togglePeriodicGc(true);
            Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
            return readWord2Vec(file);
        } else {
            log.debug("Trying simplified model restoration...");
            File tmpFileSyn0 = File.createTempFile("word2vec", "syn");
            File tmpFileConfig = File.createTempFile("word2vec", "config");
            ZipEntry ve;
            // FIX: the original leaked the ZipFile; try-with-resources closes it (and all of
            // its entry streams) once the zip contents have been extracted. The syn0 rows are
            // read from the temp file afterwards, so the zip is no longer needed by then.
            try (ZipFile zipFile = new ZipFile(file)) {
                // we don't need full model, so we go directly to syn0 file
                ZipEntry syn = zipFile.getEntry("syn0.txt");
                InputStream stream = zipFile.getInputStream(syn);
                Files.copy(stream, Paths.get(tmpFileSyn0.getAbsolutePath()), StandardCopyOption.REPLACE_EXISTING);
                // now we're restoring configuration saved earlier
                ZipEntry config = zipFile.getEntry("config.json");
                if (config != null) {
                    stream = zipFile.getInputStream(config);
                    StringBuilder builder = new StringBuilder();
                    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
                        String line;
                        while ((line = reader.readLine()) != null) {
                            builder.append(line);
                        }
                    }
                    configuration = VectorsConfiguration.fromJson(builder.toString().trim());
                }
                // optional frequencies: when present we can pre-build the vocabulary before reading syn0
                ve = zipFile.getEntry("frequencies.txt");
                if (ve != null) {
                    stream = zipFile.getInputStream(ve);
                    AtomicInteger cnt = new AtomicInteger(0);
                    try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
                        String line;
                        while ((line = reader.readLine()) != null) {
                            // line format: <b64-encoded word> <frequency> <sequences count>
                            String[] split = line.split(" ");
                            VocabWord word = new VocabWord(Double.valueOf(split[1]), decodeB64(split[0]));
                            word.setIndex(cnt.getAndIncrement());
                            word.incrementSequencesCount(Long.valueOf(split[2]));
                            vocabCache.addToken(word);
                            vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
                            Nd4j.getMemoryManager().invokeGcOccasionally();
                        }
                    }
                }
            }
            List<INDArray> rows = new ArrayList<>();
            // basically read up everything, call vstack and then return model
            try (Reader reader = new CSVReader(tmpFileSyn0)) {
                AtomicInteger cnt = new AtomicInteger(0);
                while (reader.hasNext()) {
                    Pair<VocabWord, float[]> pair = reader.next();
                    VocabWord word = pair.getFirst();
                    INDArray vector = Nd4j.create(pair.getSecond());
                    if (ve != null) {
                        // vocab was already built from frequencies.txt: write straight into syn0
                        if (syn0 == null)
                            syn0 = Nd4j.create(vocabCache.numWords(), vector.length());
                        syn0.getRow(cnt.getAndIncrement()).assign(vector);
                    } else {
                        // no frequencies available: build vocab on the fly, vstack rows later
                        rows.add(vector);
                        vocabCache.addToken(word);
                        vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
                    }
                    Nd4j.getMemoryManager().invokeGcOccasionally();
                }
            } catch (Exception e) {
                throw new RuntimeException(e);
            } finally {
                if (originalPeriodic)
                    Nd4j.getMemoryManager().togglePeriodicGc(true);
                Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
            }
            if (syn0 == null && vocabCache.numWords() > 0)
                syn0 = Nd4j.vstack(rows);
            if (syn0 == null) {
                log.error("Can't build syn0 table");
                throw new DL4JInvalidInputException("Can't build syn0 table");
            }
            lookupTable = new InMemoryLookupTable.Builder<VocabWord>().cache(vocabCache)
                            .vectorLength(syn0.columns()).useHierarchicSoftmax(false).useAdaGrad(false).build();
            lookupTable.setSyn0(syn0);
            // best-effort temp file cleanup
            try {
                tmpFileSyn0.delete();
                tmpFileConfig.delete();
            } catch (Exception e) {
                // ignored: cleanup failure is harmless, the OS reaps temp files
            }
        }
    } catch (Exception e) {
        // let's try to load this file as csv file
        try {
            log.debug("Trying CSV model restoration...");
            Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(file);
            lookupTable = pair.getFirst();
            vocabCache = (AbstractCache<VocabWord>) pair.getSecond();
        } catch (Exception ex) {
            // we fallback to trying binary model instead
            try {
                log.debug("Trying binary model restoration...");
                if (originalPeriodic)
                    Nd4j.getMemoryManager().togglePeriodicGc(true);
                Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
                vec = loadGoogleModel(file, true, true);
                return vec;
            } catch (Exception ey) {
                // try to load without linebreaks
                try {
                    if (originalPeriodic)
                        Nd4j.getMemoryManager().togglePeriodicGc(true);
                    Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
                    vec = loadGoogleModel(file, true, false);
                    return vec;
                } catch (Exception ez) {
                    throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly");
                }
            }
        }
    }
    Word2Vec.Builder builder = new Word2Vec.Builder(configuration).lookupTable(lookupTable).useAdaGrad(false)
                    .vocabCache(vocabCache).layerSize(lookupTable.layerSize()).useHierarchicSoftmax(false)
                    .resetModel(false);
    /*
        Trying to restore TokenizerFactory & TokenPreProcessor
     */
    TokenizerFactory factory = getTokenizerFactory(configuration);
    if (factory != null)
        builder.tokenizerFactory(factory);
    vec = builder.build();
    return vec;
}
Use of org.deeplearning4j.exception.DL4JInvalidInputException in the deeplearning4j project (by deeplearning4j).
Example: class RecordReaderDataSetIterator, method getDataSet.
/**
 * Converts one record (a row of {@link Writable} values) into a {@link DataSet} of
 * features and labels.
 *
 * Three input shapes are handled:
 * - two entries that are the SAME NDArrayWritable instance: no labels, features used as labels;
 * - [NDArrayWritable features, label writable]: features taken directly from the array;
 * - general case: one column per writable, label column(s) located at labelIndex..labelIndexTo.
 *
 * Side effect: when numPossibleLabels >= 1 and labelIndex < 0, labelIndex is inferred as the
 * last column and the field is updated.
 *
 * @param record one record to convert
 * @return DataSet of features and labels (features double as labels when labelIndex < 0)
 */
private DataSet getDataSet(List<Writable> record) {
    // FIX: `record` is statically typed List<Writable>, so the original
    // `record instanceof List` test was always true and its else-branch
    // (defensive copy) was dead code. Use the record directly.
    List<Writable> currList = record;
    //allow people to specify label index as -1 and infer the last possible label
    if (numPossibleLabels >= 1 && labelIndex < 0) {
        labelIndex = record.size() - 1;
    }
    INDArray label = null;
    INDArray featureVector = null;
    int featureCount = 0;
    int labelCount = 0;
    //no labels
    if (currList.size() == 2 && currList.get(1) instanceof NDArrayWritable
                    && currList.get(0) instanceof NDArrayWritable && currList.get(0) == currList.get(1)) {
        NDArrayWritable writable = (NDArrayWritable) currList.get(0);
        return new DataSet(writable.get(), writable.get());
    }
    if (currList.size() == 2 && currList.get(0) instanceof NDArrayWritable) {
        if (!regression) {
            label = FeatureUtil.toOutcomeVector((int) Double.parseDouble(currList.get(1).toString()),
                            numPossibleLabels);
        } else {
            if (currList.get(1) instanceof NDArrayWritable) {
                label = ((NDArrayWritable) currList.get(1)).get();
            } else {
                label = Nd4j.scalar(currList.get(1).toDouble());
            }
        }
        NDArrayWritable ndArrayWritable = (NDArrayWritable) currList.get(0);
        featureVector = ndArrayWritable.get();
        return new DataSet(featureVector, label);
    }
    for (int j = 0; j < currList.size(); j++) {
        Writable current = currList.get(j);
        //ndarray writable is an insane slow down here
        if (!(current instanceof NDArrayWritable) && current.toString().isEmpty())
            continue;
        if (regression && j == labelIndex && j == labelIndexTo && current instanceof NDArrayWritable) {
            //Case: NDArrayWritable for the labels
            label = ((NDArrayWritable) current).get();
        } else if (regression && j >= labelIndex && j <= labelIndexTo) {
            //This is the multi-label regression case
            if (label == null)
                label = Nd4j.create(1, (labelIndexTo - labelIndex + 1));
            label.putScalar(labelCount++, current.toDouble());
        } else if (labelIndex >= 0 && j == labelIndex) {
            //single label case (classification, etc)
            if (converter != null)
                try {
                    current = converter.convert(current);
                } catch (WritableConverterException e) {
                    // NOTE(review): conversion failure is only printed, not rethrown — the raw
                    // writable is used below. Confirm this best-effort behavior is intended.
                    e.printStackTrace();
                }
            if (numPossibleLabels < 1)
                throw new IllegalStateException("Number of possible labels invalid, must be >= 1");
            if (regression) {
                label = Nd4j.scalar(current.toDouble());
            } else {
                int curr = current.toInt();
                if (curr < 0 || curr >= numPossibleLabels) {
                    throw new DL4JInvalidInputException("Invalid classification data: expect label value (at label index column = " + labelIndex + ") to be in range 0 to " + (numPossibleLabels - 1) + " inclusive (0 to numClasses-1, with numClasses=" + numPossibleLabels + "); got label value of " + current);
                }
                label = FeatureUtil.toOutcomeVector(curr, numPossibleLabels);
            }
        } else {
            try {
                double value = current.toDouble();
                if (featureVector == null) {
                    if (regression && labelIndex >= 0) {
                        //Handle the possibly multi-label regression case here:
                        int nLabels = labelIndexTo - labelIndex + 1;
                        featureVector = Nd4j.create(1, currList.size() - nLabels);
                    } else {
                        //Classification case, and also no-labels case
                        featureVector = Nd4j.create(labelIndex >= 0 ? currList.size() - 1 : currList.size());
                    }
                }
                featureVector.putScalar(featureCount++, value);
            } catch (UnsupportedOperationException e) {
                // This isn't a scalar, so check if we got an array already
                if (current instanceof NDArrayWritable) {
                    assert featureVector == null;
                    featureVector = ((NDArrayWritable) current).get();
                } else {
                    throw e;
                }
            }
        }
    }
    return new DataSet(featureVector, labelIndex >= 0 ? label : featureVector);
}
Use of org.deeplearning4j.exception.DL4JInvalidInputException in the deeplearning4j project (by deeplearning4j).
Example: class SequenceRecordReaderDataSetIterator, method getLabels.
/**
 * Builds the label array for one sequence: shape [timeSeriesLength, vectorSize], 'f' order.
 * For regression, each time step's writables are copied as-is (an NDArrayWritable contributes
 * multiple columns). For classification, the first writable of each time step is an integer
 * class index which is converted to a one-hot row.
 *
 * @param labels one sequence; outer list = time steps, inner list = writables per step
 * @return labels array, or null for an empty sequence
 */
private INDArray getLabels(List<List<Writable>> labels) {
    //[timeSeriesLength,vectorSize]
    int[] shape = new int[2];
    shape[0] = labels.size();
    INDArray out = null;
    int row = 0;
    for (List<Writable> step : labels) {
        if (row == 0) {
            // First time step: derive the label vector width, then allocate the output
            if (regression) {
                for (Writable w : step) {
                    shape[1] += (w instanceof NDArrayWritable) ? ((NDArrayWritable) w).get().length() : 1;
                }
            } else {
                shape[1] = numPossibleLabels;
            }
            out = Nd4j.create(shape, 'f');
        }
        if (regression) {
            // Copy every value; an NDArrayWritable fills a contiguous column interval
            int col = 0;
            for (Writable current : step) {
                if (current instanceof NDArrayWritable) {
                    INDArray arr = ((NDArrayWritable) current).get();
                    out.put(new INDArrayIndex[] { NDArrayIndex.point(row),
                                    NDArrayIndex.interval(col, col + arr.length()) }, arr);
                    col += arr.length();
                } else {
                    out.put(row, col++, current.toDouble());
                }
            }
        } else {
            //Expect a single value (index) -> convert to one-hot vector
            Writable value = step.iterator().next();
            int idx = value.toInt();
            if (idx < 0 || idx >= numPossibleLabels) {
                throw new DL4JInvalidInputException("Invalid classification data: expect label value to be in range 0 to " + (numPossibleLabels - 1) + " inclusive (0 to numClasses-1, with numClasses=" + numPossibleLabels + "); got label value of " + idx);
            }
            out.getRow(row).assign(FeatureUtil.toOutcomeVector(idx, numPossibleLabels));
        }
        row++;
    }
    return out;
}
Use of org.deeplearning4j.exception.DL4JInvalidInputException in the deeplearning4j project (by deeplearning4j).
Example: class SequenceRecordReaderDataSetIterator, method getFeaturesLabelsSingleReader.
/**
 * Splits a single-reader sequence into features and labels: the writable at column
 * {@code labelIndex} of each time step is the label, everything else is a feature.
 * Sizes are derived from the first time step; an NDArrayWritable contributes its array
 * length in columns, any other writable contributes one column.
 *
 * @param input one sequence; outer list = time steps, inner list = writables per step
 * @return array of {features, labels}, both shaped [timeSeriesLength, width]
 */
private INDArray[] getFeaturesLabelsSingleReader(List<List<Writable>> input) {
    INDArray features = null;
    INDArray labels = null;
    int row = 0;
    for (List<Writable> step : input) {
        if (row == 0) {
            // First: determine the features size. Usually equal to the number of Writable
            // objects, except when one or more of the Writables is an NDArrayWritable.
            int featureSize = 0;
            int col = 0;
            for (Writable w : step) {
                if (col++ != labelIndex) {
                    featureSize += (w instanceof NDArrayWritable) ? ((NDArrayWritable) w).get().length() : 1;
                }
            }
            features = Nd4j.zeros(input.size(), featureSize);
            // Second: determine the output (labels) size.
            int labelSize;
            if (regression) {
                Writable labelWritable = step.get(labelIndex);
                labelSize = (labelWritable instanceof NDArrayWritable)
                                ? ((NDArrayWritable) labelWritable).get().length() : 1;
            } else {
                //Classification: integer -> one-hot
                labelSize = numPossibleLabels;
            }
            labels = Nd4j.zeros(input.size(), labelSize);
        }
        int colIn = 0;
        int featureCol = 0;
        for (Writable current : step) {
            if (colIn++ == labelIndex) {
                //label column
                if (regression) {
                    if (current instanceof NDArrayWritable) {
                        //Standard case
                        labels.putRow(row, ((NDArrayWritable) current).get());
                    } else {
                        labels.put(row, 0, current.toDouble());
                    }
                } else {
                    int idx = current.toInt();
                    if (idx < 0 || idx >= numPossibleLabels) {
                        throw new DL4JInvalidInputException("Invalid classification data: expect label value (at label index column = " + labelIndex + ") to be in range 0 to " + (numPossibleLabels - 1) + " inclusive (0 to numClasses-1, with numClasses=" + numPossibleLabels + "); got label value of " + current);
                    }
                    //Rows start zeroed; flip the one-hot position
                    labels.putScalar(row, idx, 1.0);
                }
            } else {
                //feature column(s)
                if (current instanceof NDArrayWritable) {
                    //NDArrayWritable: fills multiple consecutive columns
                    INDArray arr = ((NDArrayWritable) current).get();
                    int length = arr.length();
                    features.put(new INDArrayIndex[] { NDArrayIndex.point(row),
                                    NDArrayIndex.interval(featureCol, featureCol + length) }, arr);
                    featureCol += length;
                } else {
                    //Standard case: single value
                    features.put(row, featureCol++, current.toDouble());
                }
            }
        }
        row++;
    }
    return new INDArray[] { features, labels };
}
Use of org.deeplearning4j.exception.DL4JInvalidInputException in the deeplearning4j project (by deeplearning4j).
Example: class SparkDM, method frameSequence.
/**
 * Builds one training Frame of CBOW-style request messages for the given sequence.
 * DM flavour: for each element, the surrounding context words AND the sequence labels
 * are collected into the window, so labels are trained alongside context.
 * Throws DL4JInvalidInputException when the sequence carries no labels.
 */
@Override
public Frame<? extends TrainingMessage> frameSequence(Sequence<ShallowSequenceElement> sequence, AtomicLong nextRandom, double learningRate) {
    // Optional subsampling of frequent elements before building training requests
    if (vectorsConfiguration.getSampling() > 0)
        sequence = BaseSparkLearningAlgorithm.applySubsampling(sequence, nextRandom, 10L, vectorsConfiguration.getSampling());
    int currentWindow = vectorsConfiguration.getWindow();
    // If variable window sizes are configured, pick one at random for this sequence
    if (vectorsConfiguration.getVariableWindows() != null && vectorsConfiguration.getVariableWindows().length != 0) {
        currentWindow = vectorsConfiguration.getVariableWindows()[RandomUtils.nextInt(vectorsConfiguration.getVariableWindows().length)];
    }
    // Lazy init of the ThreadLocal frame holder.
    // NOTE(review): this is double-checked locking on `frame`; it is only safe if the
    // field is declared volatile — declaration is not visible here, confirm it.
    if (frame == null)
        synchronized (this) {
            if (frame == null)
                frame = new ThreadLocal<>();
        }
    if (frame.get() == null)
        frame.set(new Frame<CbowRequestMessage>(BasicSequenceProvider.getInstance().getNextValue()));
    for (int i = 0; i < sequence.getElements().size(); i++) {
        // word2vec-style linear congruential update of the shared random state
        nextRandom.set(Math.abs(nextRandom.get() * 25214903917L + 11));
        // NOTE(review): the long is non-negative after Math.abs, but the (int) cast can
        // still yield a negative value, which would make the window shrink `b` negative —
        // TODO confirm whether that is intended.
        int b = (int) nextRandom.get() % currentWindow;
        int end = currentWindow * 2 + 1 - b;
        ShallowSequenceElement currentWord = sequence.getElementByIndex(i);
        // Collect the indices of context words in the (randomly shrunk) window around i,
        // skipping the center position (the current word itself)
        List<Integer> intsList = new ArrayList<>();
        for (int a = b; a < end; a++) {
            if (a != currentWindow) {
                int c = i - currentWindow + a;
                if (c >= 0 && c < sequence.size()) {
                    ShallowSequenceElement lastWord = sequence.getElementByIndex(c);
                    intsList.add(lastWord.getIndex());
                }
            }
        }
        // basically it's the same as CBOW, we just add labels here
        if (sequence.getSequenceLabels() != null) {
            for (ShallowSequenceElement label : sequence.getSequenceLabels()) {
                intsList.add(label.getIndex());
            }
        } else
            // FIXME: we probably should throw this exception earlier?
            throw new DL4JInvalidInputException("Sequence passed via RDD has no labels within, nothing to learn here");
        // just converting values to int
        int[] windowWords = new int[intsList.size()];
        for (int x = 0; x < windowWords.length; x++) {
            windowWords[x] = intsList.get(x);
        }
        if (windowWords.length < 1)
            continue;
        iterateSample(currentWord, windowWords, nextRandom, learningRate, false, 0, true, null);
    }
    // Hand the accumulated frame to the caller and start a fresh one for the next sequence
    Frame<CbowRequestMessage> currentFrame = frame.get();
    frame.set(new Frame<CbowRequestMessage>(BasicSequenceProvider.getInstance().getNextValue()));
    return currentFrame;
}
Aggregations