
Example 41 with INDArray

use of org.nd4j.linalg.api.ndarray.INDArray in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method loadFullModel.

/**
     * This method loads a full w2v model, previously saved with a writeFullModel() call
     *
     * Deprecation note: please consider using readWord2VecModel() or loadStaticModel() instead
     *
     * @param path path to a previously stored w2v JSON model
     * @return Word2Vec instance
     */
@Deprecated
public static Word2Vec loadFullModel(@NonNull String path) throws FileNotFoundException {
    /*
            // TODO: implementation is in process
            We need to restore:
                     1. WeightLookupTable, including syn0 and syn1 matrices
                     2. VocabCache + mark it as SPECIAL, to avoid accidental word removals
         */
    BasicLineIterator iterator = new BasicLineIterator(new File(path));
    // first 3 lines should be processed separately
    String confJson = iterator.nextSentence();
    log.info("Word2Vec conf. JSON: " + confJson);
    VectorsConfiguration configuration = VectorsConfiguration.fromJson(confJson);
    // we don't actually need expTable, since it produces identical results on subsequent runs as long as its size isn't modified :)
    String eTable = iterator.nextSentence();
    double[] expTable;
    String nTable = iterator.nextSentence();
    if (configuration.getNegative() > 0) {
    // TODO: we should probably parse negTable, but it isn't required until vocab changes are introduced; on a predefined vocab it reproduces exactly the same nTable (the same goes for expTable, by the way).
    }
    /*
                Since we're restoring the vocab from a previously serialized model, minWordFrequency has already been applied to its vocabulary, so it should NOT be truncated again.
                That's why minWordFrequency is set to the configuration value, while SPECIAL is applied to each word to avoid truncation.
         */
    VocabularyHolder holder = new VocabularyHolder.Builder().minWordFrequency(configuration.getMinWordFrequency()).hugeModelExpected(configuration.isHugeModelExpected()).scavengerActivationThreshold(configuration.getScavengerActivationThreshold()).scavengerRetentionDelay(configuration.getScavengerRetentionDelay()).build();
    AtomicInteger counter = new AtomicInteger(0);
    AbstractCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
    while (iterator.hasNext()) {
        //    log.info("got line: " + iterator.nextSentence());
        String wordJson = iterator.nextSentence();
        VocabularyWord word = VocabularyWord.fromJson(wordJson);
        word.setSpecial(true);
        VocabWord vw = new VocabWord(word.getCount(), word.getWord());
        vw.setIndex(counter.getAndIncrement());
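        // note: the sequential index set above is immediately overridden by the Huffman node index stored with the word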
        vw.setIndex(word.getHuffmanNode().getIdx());
        vw.setCodeLength(word.getHuffmanNode().getLength());
        vw.setPoints(arrayToList(word.getHuffmanNode().getPoint(), word.getHuffmanNode().getLength()));
        vw.setCodes(arrayToList(word.getHuffmanNode().getCode(), word.getHuffmanNode().getLength()));
        vocabCache.addToken(vw);
        vocabCache.addWordToIndex(vw.getIndex(), vw.getLabel());
        vocabCache.putVocabWord(vw.getWord());
    }
    // at this point the vocab is restored, and it's time to rebuild the Huffman tree
    // since the word counters are equal, the Huffman tree will be equal too
    //holder.updateHuffmanCodes();
    // we definitely don't need the UNK word in this scenario
    //        holder.transferBackToVocabCache(vocabCache, false);
    // now it's time to transfer the syn0/syn1/syn1Neg values
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().negative(configuration.getNegative()).useAdaGrad(configuration.isUseAdaGrad()).lr(configuration.getLearningRate()).cache(vocabCache).vectorLength(configuration.getLayersSize()).build();
    // we create all arrays
    lookupTable.resetWeights(true);
    iterator.reset();
    // we should skip 3 lines from file
    iterator.nextSentence();
    iterator.nextSentence();
    iterator.nextSentence();
    // now, for each word from vocabHolder we'll just transfer actual values
    while (iterator.hasNext()) {
        String wordJson = iterator.nextSentence();
        VocabularyWord word = VocabularyWord.fromJson(wordJson);
        // syn0 transfer
        INDArray syn0 = lookupTable.getSyn0().getRow(vocabCache.indexOf(word.getWord()));
        syn0.assign(Nd4j.create(word.getSyn0()));
        // syn1 transfer
        // syn1 values are normally accessed via tree points, but since our goal is just deserialization, we can push them row by row
        INDArray syn1 = lookupTable.getSyn1().getRow(vocabCache.indexOf(word.getWord()));
        syn1.assign(Nd4j.create(word.getSyn1()));
        // syn1Neg transfer
        if (configuration.getNegative() > 0) {
            INDArray syn1Neg = lookupTable.getSyn1Neg().getRow(vocabCache.indexOf(word.getWord()));
            syn1Neg.assign(Nd4j.create(word.getSyn1Neg()));
        }
    }
    Word2Vec vec = new Word2Vec.Builder(configuration).vocabCache(vocabCache).lookupTable(lookupTable).resetModel(false).build();
    vec.setModelUtils(new BasicModelUtils());
    return vec;
}
Also used : BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) VocabularyHolder(org.deeplearning4j.models.word2vec.wordstore.VocabularyHolder) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) BasicModelUtils(org.deeplearning4j.models.embeddings.reader.impl.BasicModelUtils) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) VocabularyWord(org.deeplearning4j.models.word2vec.wordstore.VocabularyWord) StaticWord2Vec(org.deeplearning4j.models.word2vec.StaticWord2Vec) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec) ZipFile(java.util.zip.ZipFile)
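
A minimal usage sketch for this deprecated loader (the model path and the restoreAndQuery helper are hypothetical; per the javadoc above, readWord2VecModel() or loadStaticModel() is the preferred entry point):

// hypothetical helper, not part of WordVectorSerializer: restores a model previously
// stored with writeFullModel() and runs a couple of sanity queries
public static void restoreAndQuery() throws FileNotFoundException {
    // illustrative path only
    Word2Vec restored = WordVectorSerializer.loadFullModel("/tmp/w2v_full_model.txt");
    double sim = restored.similarity("day", "night");
    Collection<String> nearest = restored.wordsNearest("day", 10);
    System.out.println("similarity(day, night) = " + sim + ", nearest to 'day': " + nearest);
}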

Example 42 with INDArray

use of org.nd4j.linalg.api.ndarray.INDArray in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method readBinaryModel.

/**
     * Read a binary word2vec file.
     *
     * @param modelFile
     *            the File to read
     * @param linebreaks
     *            if true, the reader expects each word/vector to be in a separate line, terminated
     *            by a line break
     * @param normalize
     *            if true, each vector is normalized to unit length as it is read
     * @return a {@link Word2Vec} model
     * @throws NumberFormatException
     * @throws IOException
     * @throws FileNotFoundException
     */
private static Word2Vec readBinaryModel(File modelFile, boolean linebreaks, boolean normalize) throws NumberFormatException, IOException {
    InMemoryLookupTable<VocabWord> lookupTable;
    VocabCache<VocabWord> cache;
    INDArray syn0;
    int words, size;
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    if (originalPeriodic)
        Nd4j.getMemoryManager().togglePeriodicGc(false);
    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);
    try (BufferedInputStream bis = new BufferedInputStream(GzipUtils.isCompressedFilename(modelFile.getName()) ? new GZIPInputStream(new FileInputStream(modelFile)) : new FileInputStream(modelFile));
        DataInputStream dis = new DataInputStream(bis)) {
        words = Integer.parseInt(readString(dis));
        size = Integer.parseInt(readString(dis));
        syn0 = Nd4j.create(words, size);
        cache = new AbstractCache<>();
        printOutProjectedMemoryUse(words, size, 1);
        lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().cache(cache).useHierarchicSoftmax(false).vectorLength(size).build();
        String word;
        float[] vector = new float[size];
        for (int i = 0; i < words; i++) {
            word = readString(dis);
            log.trace("Loading " + word + " with word " + i);
            for (int j = 0; j < size; j++) {
                vector[j] = readFloat(dis);
            }
            syn0.putRow(i, normalize ? Transforms.unitVec(Nd4j.create(vector)) : Nd4j.create(vector));
            VocabWord vw = new VocabWord(1.0, word);
            vw.setIndex(cache.numWords());
            cache.addToken(vw);
            cache.addWordToIndex(vw.getIndex(), vw.getLabel());
            cache.putVocabWord(word);
            if (linebreaks) {
                // line break
                dis.readByte();
            }
            Nd4j.getMemoryManager().invokeGcOccasionally();
        }
    } finally {
        if (originalPeriodic)
            Nd4j.getMemoryManager().togglePeriodicGc(true);
        Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
    }
    lookupTable.setSyn0(syn0);
    Word2Vec ret = new Word2Vec.Builder().useHierarchicSoftmax(false).resetModel(false).layerSize(syn0.columns()).allowParallelTokenization(true).elementsLearningAlgorithm(new SkipGram<VocabWord>()).learningRate(0.025).windowSize(5).workers(1).build();
    ret.setVocab(cache);
    ret.setLookupTable(lookupTable);
    return ret;
}
Also used : VocabWord(org.deeplearning4j.models.word2vec.VocabWord) GZIPInputStream(java.util.zip.GZIPInputStream) INDArray(org.nd4j.linalg.api.ndarray.INDArray) StaticWord2Vec(org.deeplearning4j.models.word2vec.StaticWord2Vec) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec)
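
Since readBinaryModel() is private, a caller would normally go through a public entry point such as readWord2VecModel(), which should dispatch to this binary reader for word2vec .bin / .bin.gz files. A minimal sketch, with an illustrative file path:

// hypothetical usage: load the classic Google News binary vectors through the public API
Word2Vec googleVectors = WordVectorSerializer.readWord2VecModel(new File("/data/GoogleNews-vectors-negative300.bin.gz"));
// look up a single vector; its length equals the 'size' value read from the binary header
INDArray dayVector = googleVectors.getWordVectorMatrix("day");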

Example 43 with INDArray

use of org.nd4j.linalg.api.ndarray.INDArray in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method writeWord2VecModel.

/**
     * This method saves the Word2Vec model into a compressed ZIP archive and writes it to the output stream.
     * PLEASE NOTE: This method saves the FULL model, including syn0 AND syn1
     *
     * @param vectors the Word2Vec model to save
     * @param stream the output stream to write the ZIP archive to
     * @throws IOException
     */
public static void writeWord2VecModel(Word2Vec vectors, OutputStream stream) throws IOException {
    ZipOutputStream zipfile = new ZipOutputStream(new BufferedOutputStream(new CloseShieldOutputStream(stream)));
    ZipEntry syn0 = new ZipEntry("syn0.txt");
    zipfile.putNextEntry(syn0);
    // writing out syn0
    File tempFileSyn0 = File.createTempFile("word2vec", "0");
    tempFileSyn0.deleteOnExit();
    writeWordVectors(vectors.lookupTable(), tempFileSyn0);
    BufferedInputStream fis = new BufferedInputStream(new FileInputStream(tempFileSyn0));
    writeEntry(fis, zipfile);
    fis.close();
    // writing out syn1
    File tempFileSyn1 = File.createTempFile("word2vec", "1");
    tempFileSyn1.deleteOnExit();
    INDArray syn1 = ((InMemoryLookupTable<VocabWord>) vectors.getLookupTable()).getSyn1();
    if (syn1 != null)
        try (PrintWriter writer = new PrintWriter(new FileWriter(tempFileSyn1))) {
            for (int x = 0; x < syn1.rows(); x++) {
                INDArray row = syn1.getRow(x);
                StringBuilder builder = new StringBuilder();
                for (int i = 0; i < row.length(); i++) {
                    builder.append(row.getDouble(i)).append(" ");
                }
                writer.println(builder.toString().trim());
            }
        }
    ZipEntry zSyn1 = new ZipEntry("syn1.txt");
    zipfile.putNextEntry(zSyn1);
    fis = new BufferedInputStream(new FileInputStream(tempFileSyn1));
    writeEntry(fis, zipfile);
    fis.close();
    // writing out syn1Neg
    File tempFileSyn1Neg = File.createTempFile("word2vec", "n");
    tempFileSyn1Neg.deleteOnExit();
    INDArray syn1Neg = ((InMemoryLookupTable<VocabWord>) vectors.getLookupTable()).getSyn1Neg();
    if (syn1Neg != null)
        try (PrintWriter writer = new PrintWriter(new FileWriter(tempFileSyn1Neg))) {
            for (int x = 0; x < syn1Neg.rows(); x++) {
                INDArray row = syn1Neg.getRow(x);
                StringBuilder builder = new StringBuilder();
                for (int i = 0; i < row.length(); i++) {
                    builder.append(row.getDouble(i)).append(" ");
                }
                writer.println(builder.toString().trim());
            }
        }
    ZipEntry zSyn1Neg = new ZipEntry("syn1Neg.txt");
    zipfile.putNextEntry(zSyn1Neg);
    fis = new BufferedInputStream(new FileInputStream(tempFileSyn1Neg));
    writeEntry(fis, zipfile);
    fis.close();
    File tempFileCodes = File.createTempFile("word2vec", "h");
    tempFileCodes.deleteOnExit();
    ZipEntry hC = new ZipEntry("codes.txt");
    zipfile.putNextEntry(hC);
    // writing out huffman tree
    try (PrintWriter writer = new PrintWriter(new FileWriter(tempFileCodes))) {
        for (int i = 0; i < vectors.getVocab().numWords(); i++) {
            VocabWord word = vectors.getVocab().elementAtIndex(i);
            StringBuilder builder = new StringBuilder(encodeB64(word.getLabel())).append(" ");
            for (int code : word.getCodes()) {
                builder.append(code).append(" ");
            }
            writer.println(builder.toString().trim());
        }
    }
    fis = new BufferedInputStream(new FileInputStream(tempFileCodes));
    writeEntry(fis, zipfile);
    fis.close();
    File tempFileHuffman = File.createTempFile("word2vec", "h");
    tempFileHuffman.deleteOnExit();
    ZipEntry hP = new ZipEntry("huffman.txt");
    zipfile.putNextEntry(hP);
    // writing out huffman tree
    try (PrintWriter writer = new PrintWriter(new FileWriter(tempFileHuffman))) {
        for (int i = 0; i < vectors.getVocab().numWords(); i++) {
            VocabWord word = vectors.getVocab().elementAtIndex(i);
            StringBuilder builder = new StringBuilder(encodeB64(word.getLabel())).append(" ");
            for (int point : word.getPoints()) {
                builder.append(point).append(" ");
            }
            writer.println(builder.toString().trim());
        }
    }
    fis = new BufferedInputStream(new FileInputStream(tempFileHuffman));
    writeEntry(fis, zipfile);
    fis.close();
    File tempFileFreqs = File.createTempFile("word2vec", "f");
    tempFileFreqs.deleteOnExit();
    ZipEntry hF = new ZipEntry("frequencies.txt");
    zipfile.putNextEntry(hF);
    // writing out word frequencies
    try (PrintWriter writer = new PrintWriter(new FileWriter(tempFileFreqs))) {
        for (int i = 0; i < vectors.getVocab().numWords(); i++) {
            VocabWord word = vectors.getVocab().elementAtIndex(i);
            StringBuilder builder = new StringBuilder(encodeB64(word.getLabel())).append(" ").append(word.getElementFrequency()).append(" ").append(vectors.getVocab().docAppearedIn(word.getLabel()));
            writer.println(builder.toString().trim());
        }
    }
    fis = new BufferedInputStream(new FileInputStream(tempFileFreqs));
    writeEntry(fis, zipfile);
    fis.close();
    ZipEntry config = new ZipEntry("config.json");
    zipfile.putNextEntry(config);
    //log.info("Current config: {}", vectors.getConfiguration().toJson());
    writeEntry(new ByteArrayInputStream(vectors.getConfiguration().toJson().getBytes()), zipfile);
    zipfile.flush();
    zipfile.close();
    try {
        tempFileCodes.delete();
        tempFileFreqs.delete();
        tempFileHuffman.delete();
        tempFileSyn0.delete();
        tempFileSyn1.delete();
        tempFileSyn1Neg.delete();
    } catch (Exception e) {
    //
    }
}
Also used : ZipEntry(java.util.zip.ZipEntry) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) DL4JInvalidInputException(org.deeplearning4j.exception.DL4JInvalidInputException) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ZipOutputStream(java.util.zip.ZipOutputStream) ZipFile(java.util.zip.ZipFile)
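
A minimal usage sketch for the writer (the output path is illustrative and 'trainedVectors' is assumed to be a trained Word2Vec model):

// hypothetical usage: serialize a trained model, syn0 and syn1 included, into a zip archive
try (OutputStream os = new BufferedOutputStream(new FileOutputStream("/tmp/w2v_full.zip"))) {
    WordVectorSerializer.writeWord2VecModel(trainedVectors, os);
}
// the archive written here is what readWord2VecModel() expects when restoring the full model
Word2Vec restored = WordVectorSerializer.readWord2VecModel(new File("/tmp/w2v_full.zip"));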

Example 44 with INDArray

use of org.nd4j.linalg.api.ndarray.INDArray in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method writeTsneFormat.

/**
     * Write the t-SNE format
     *
     * @param vec
     *            the word vectors to use for labeling
     * @param tsne
     *            the tsne array to write
     * @param csv
     *            the csv file to write to
     * @throws Exception
     */
public static void writeTsneFormat(Glove vec, INDArray tsne, File csv) throws Exception {
    BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), "UTF-8"));
    int words = 0;
    InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab();
    for (String word : vec.vocab().words()) {
        if (word == null) {
            continue;
        }
        StringBuilder sb = new StringBuilder();
        INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex());
        for (int j = 0; j < wordVector.length(); j++) {
            sb.append(wordVector.getDouble(j));
            if (j < wordVector.length() - 1) {
                sb.append(",");
            }
        }
        sb.append(",");
        sb.append(word.replaceAll(" ", whitespaceReplacement));
        sb.append(" ");
        sb.append("\n");
        write.write(sb.toString());
        words++;
    }
    log.info("Wrote " + words + " words with size of " + vec.lookupTable().layerSize());
    write.flush();
    write.close();
}
Also used : INDArray(org.nd4j.linalg.api.ndarray.INDArray) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache)
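
A minimal usage sketch (the 'glove' model is assumed to be trained; the coordinates here are random placeholders standing in for the output of a real dimensionality-reduction step such as t-SNE):

// hypothetical usage: write one row of 2-D coordinates per vocab word, labeled with the word itself
INDArray coords = Nd4j.rand(glove.vocab().numWords(), 2);
WordVectorSerializer.writeTsneFormat(glove, coords, new File("/tmp/tsne.csv"));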

Example 45 with INDArray

use of org.nd4j.linalg.api.ndarray.INDArray in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method loadTxt.

/**
     * Loads an in-memory cache from the given path (sets syn0 and the vocab)
     *
     * @param vectorsFile the path of the file to load
     * @return a Pair holding the lookup table and the vocab cache.
     * @throws FileNotFoundException if the input file does not exist
     * @throws UnsupportedEncodingException if the file encoding cannot be handled
     */
public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile) throws FileNotFoundException, UnsupportedEncodingException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(vectorsFile), "UTF-8"));
    AbstractCache cache = new AbstractCache<>();
    LineIterator iter = IOUtils.lineIterator(reader);
    String line = null;
    boolean hasHeader = false;
    if (iter.hasNext()) {
        // skip header line
        line = iter.nextLine();
        //look for spaces
        if (!line.contains(" ")) {
            log.debug("Skipping first line");
            hasHeader = true;
        } else {
            // we should check for something that looks like proper word vectors here, i.e. one word at position 0, followed by a bunch of floats
            String[] split = line.split(" ");
            try {
                long[] header = new long[split.length];
                for (int x = 0; x < split.length; x++) {
                    header[x] = Long.parseLong(split[x]);
                }
                if (split.length < 4)
                    hasHeader = true;
                // [2] - number of documents <-- DL4j-only value
                if (split.length == 3)
                    cache.incrementTotalDocCount(header[2]);
                printOutProjectedMemoryUse(header[0], (int) header[1], 1);
                hasHeader = true;
                try {
                    reader.close();
                } catch (Exception ex) {
                }
            } catch (Exception e) {
                // if any conversion exception hits - that'll be considered header
                hasHeader = false;
            }
        }
    }
    //reposition buffer to be one line ahead
    if (hasHeader) {
        line = "";
        iter.close();
        reader = new BufferedReader(new FileReader(vectorsFile));
        iter = IOUtils.lineIterator(reader);
        iter.nextLine();
    }
    List<INDArray> arrays = new ArrayList<>();
    while (iter.hasNext()) {
        if (line.isEmpty())
            line = iter.nextLine();
        String[] split = line.split(" ");
        //split[0].replaceAll(whitespaceReplacement, " ");
        String word = decodeB64(split[0]);
        VocabWord word1 = new VocabWord(1.0, word);
        word1.setIndex(cache.numWords());
        cache.addToken(word1);
        cache.addWordToIndex(word1.getIndex(), word);
        cache.putVocabWord(word);
        float[] vector = new float[split.length - 1];
        for (int i = 1; i < split.length; i++) {
            vector[i - 1] = Float.parseFloat(split[i]);
        }
        INDArray row = Nd4j.create(vector);
        arrays.add(row);
        // workaround for skipped first row
        line = "";
    }
    INDArray syn = Nd4j.vstack(arrays);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().vectorLength(arrays.get(0).columns()).useAdaGrad(false).cache(cache).useHierarchicSoftmax(false).build();
    if (Nd4j.ENFORCE_NUMERICAL_STABILITY)
        Nd4j.clearNans(syn);
    lookupTable.setSyn0(syn);
    iter.close();
    try {
        reader.close();
    } catch (Exception e) {
    }
    return new Pair<>(lookupTable, (VocabCache) cache);
}
Also used : ArrayList(java.util.ArrayList) VocabWord(org.deeplearning4j.models.word2vec.VocabWord) AbstractCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.AbstractCache) LineIterator(org.apache.commons.io.LineIterator) BasicLineIterator(org.deeplearning4j.text.sentenceiterator.BasicLineIterator) DL4JInvalidInputException(org.deeplearning4j.exception.DL4JInvalidInputException) ND4JIllegalStateException(org.nd4j.linalg.exception.ND4JIllegalStateException) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) Pair(org.deeplearning4j.berkeley.Pair)
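
A minimal usage sketch for the text loader (file path is illustrative):

// hypothetical usage: load text-format vectors, then look up a single row from syn0
Pair<InMemoryLookupTable, VocabCache> pair = WordVectorSerializer.loadTxt(new File("/tmp/vectors.txt"));
InMemoryLookupTable lookupTable = pair.getFirst();
VocabCache vocabCache = pair.getSecond();
INDArray dayVector = lookupTable.getSyn0().getRow(vocabCache.indexOf("day"));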

Aggregations

INDArray (org.nd4j.linalg.api.ndarray.INDArray): 1034
Test (org.junit.Test): 453
NeuralNetConfiguration (org.deeplearning4j.nn.conf.NeuralNetConfiguration): 173
DataSet (org.nd4j.linalg.dataset.DataSet): 171
MultiLayerNetwork (org.deeplearning4j.nn.multilayer.MultiLayerNetwork): 166
MultiLayerConfiguration (org.deeplearning4j.nn.conf.MultiLayerConfiguration): 143
Gradient (org.deeplearning4j.nn.gradient.Gradient): 100
Layer (org.deeplearning4j.nn.api.Layer): 82
NormalDistribution (org.deeplearning4j.nn.conf.distribution.NormalDistribution): 77
OutputLayer (org.deeplearning4j.nn.conf.layers.OutputLayer): 69
DefaultGradient (org.deeplearning4j.nn.gradient.DefaultGradient): 68
File (java.io.File): 67
DenseLayer (org.deeplearning4j.nn.conf.layers.DenseLayer): 66
ArrayList (java.util.ArrayList): 65
ComputationGraph (org.deeplearning4j.nn.graph.ComputationGraph): 62
DataSetIterator (org.nd4j.linalg.dataset.api.iterator.DataSetIterator): 62
Pair (org.deeplearning4j.berkeley.Pair): 56
Random (java.util.Random): 54
ComputationGraphConfiguration (org.deeplearning4j.nn.conf.ComputationGraphConfiguration): 53
IrisDataSetIterator (org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator): 44