Example 16 with ZipEntry

use of java.util.zip.ZipEntry in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method readWord2VecModel.

/**
     * This method restores a Word2Vec model from a file. The file can be in one of the following formats:
     * 1) Binary model, either compressed or not, like the well-known Google model
     * 2) Popular CSV word2vec text format
     * 3) DL4j compressed format
     *
     * Please note: if extended data isn't available, only weights will be loaded.
     *
     * @param file model file in one of the formats listed above
     * @param extendedModel if TRUE, we'll try to load HS states & Huffman tree info; if FALSE, only weights will be loaded
     * @return restored Word2Vec model
     */
public static Word2Vec readWord2VecModel(@NonNull File file, boolean extendedModel) {
    InMemoryLookupTable<VocabWord> lookupTable = new InMemoryLookupTable<>();
    AbstractCache<VocabWord> vocabCache = new AbstractCache<>();
    Word2Vec vec;
    INDArray syn0 = null;
    VectorsConfiguration configuration = new VectorsConfiguration();
    if (!file.exists() || !file.isFile())
        throw new ND4JIllegalStateException("File [" + file.getAbsolutePath() + "] doesn't exist");
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    if (originalPeriodic)
        Nd4j.getMemoryManager().togglePeriodicGc(false);
    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);
    // try to load zip format
    try {
        if (extendedModel) {
            log.debug("Trying full model restoration...");
            if (originalPeriodic)
                Nd4j.getMemoryManager().togglePeriodicGc(true);
            Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
            return readWord2Vec(file);
        } else {
            log.debug("Trying simplified model restoration...");
            File tmpFileSyn0 = File.createTempFile("word2vec", "syn");
            File tmpFileConfig = File.createTempFile("word2vec", "config");
            // we don't need full model, so we go directly to syn0 file
            ZipFile zipFile = new ZipFile(file);
            ZipEntry syn = zipFile.getEntry("syn0.txt");
            InputStream stream = zipFile.getInputStream(syn);
            Files.copy(stream, Paths.get(tmpFileSyn0.getAbsolutePath()), StandardCopyOption.REPLACE_EXISTING);
            // now we're restoring configuration saved earlier
            ZipEntry config = zipFile.getEntry("config.json");
            if (config != null) {
                stream = zipFile.getInputStream(config);
                StringBuilder builder = new StringBuilder();
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        builder.append(line);
                    }
                }
                configuration = VectorsConfiguration.fromJson(builder.toString().trim());
            }
            ZipEntry ve = zipFile.getEntry("frequencies.txt");
            if (ve != null) {
                stream = zipFile.getInputStream(ve);
                AtomicInteger cnt = new AtomicInteger(0);
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        String[] split = line.split(" ");
                        VocabWord word = new VocabWord(Double.valueOf(split[1]), decodeB64(split[0]));
                        word.setIndex(cnt.getAndIncrement());
                        word.incrementSequencesCount(Long.valueOf(split[2]));
                        vocabCache.addToken(word);
                        vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
                        Nd4j.getMemoryManager().invokeGcOccasionally();
                    }
                }
            }
            List<INDArray> rows = new ArrayList<>();
            // basically read everything, call vstack and then return the model
            try (Reader reader = new CSVReader(tmpFileSyn0)) {
                AtomicInteger cnt = new AtomicInteger(0);
                while (reader.hasNext()) {
                    Pair<VocabWord, float[]> pair = reader.next();
                    VocabWord word = pair.getFirst();
                    INDArray vector = Nd4j.create(pair.getSecond());
                    if (ve != null) {
                        if (syn0 == null)
                            syn0 = Nd4j.create(vocabCache.numWords(), vector.length());
                        syn0.getRow(cnt.getAndIncrement()).assign(vector);
                    } else {
                        rows.add(vector);
                        vocabCache.addToken(word);
                        vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
                    }
                    Nd4j.getMemoryManager().invokeGcOccasionally();
                }
            } catch (Exception e) {
                throw new RuntimeException(e);
            } finally {
                if (originalPeriodic)
                    Nd4j.getMemoryManager().togglePeriodicGc(true);
                Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
            }
            if (syn0 == null && vocabCache.numWords() > 0)
                syn0 = Nd4j.vstack(rows);
            if (syn0 == null) {
                log.error("Can't build syn0 table");
                throw new DL4JInvalidInputException("Can't build syn0 table");
            }
            lookupTable = new InMemoryLookupTable.Builder<VocabWord>().cache(vocabCache).vectorLength(syn0.columns()).useHierarchicSoftmax(false).useAdaGrad(false).build();
            lookupTable.setSyn0(syn0);
            try {
                tmpFileSyn0.delete();
                tmpFileConfig.delete();
            } catch (Exception e) {
            // ignore failures while deleting the temporary files
            }
        }
    } catch (Exception e) {
        // let's try to load this file as a CSV file
        try {
            log.debug("Trying CSV model restoration...");
            Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(file);
            lookupTable = pair.getFirst();
            vocabCache = (AbstractCache<VocabWord>) pair.getSecond();
        } catch (Exception ex) {
            // we fall back to trying the binary model instead
            try {
                log.debug("Trying binary model restoration...");
                if (originalPeriodic)
                    Nd4j.getMemoryManager().togglePeriodicGc(true);
                Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
                vec = loadGoogleModel(file, true, true);
                return vec;
            } catch (Exception ey) {
                // try to load without linebreaks
                try {
                    if (originalPeriodic)
                        Nd4j.getMemoryManager().togglePeriodicGc(true);
                    Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
                    vec = loadGoogleModel(file, true, false);
                    return vec;
                } catch (Exception ez) {
                    throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly");
                }
            }
        }
    }
    Word2Vec.Builder builder = new Word2Vec.Builder(configuration).lookupTable(lookupTable).useAdaGrad(false).vocabCache(vocabCache).layerSize(lookupTable.layerSize()).useHierarchicSoftmax(false).resetModel(false);
    /*
            Trying to restore TokenizerFactory & TokenPreProcessor
         */
    TokenizerFactory factory = getTokenizerFactory(configuration);
    if (factory != null)
        builder.tokenizerFactory(factory);
    vec = builder.build();
    return vec;
}
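
For orientation, here is a minimal, hypothetical usage sketch of this loader; the model path is a placeholder, and only the public readWord2VecModel API shown above is assumed:

import java.io.File;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;

public class ReadWord2VecExample {
    public static void main(String[] args) {
        // Placeholder path to a model in Google binary, CSV, or DL4j zip format
        File modelFile = new File("/path/to/word2vec-model.zip");
        // extendedModel = false: only weights are restored; HS states and the Huffman tree are skipped
        Word2Vec vec = WordVectorSerializer.readWord2VecModel(modelFile, false);
        // Query the restored vectors
        System.out.println("similarity(day, night) = " + vec.similarity("day", "night"));
    }
}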

Example 17 with ZipEntry

use of java.util.zip.ZipEntry in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method readParagraphVectors.

/**
     * This method restores a ParagraphVectors model previously saved with writeParagraphVectors()
     *
     * @param file model file previously saved with writeParagraphVectors()
     * @return restored ParagraphVectors model
     */
public static ParagraphVectors readParagraphVectors(File file) throws IOException {
    File tmpFileL = File.createTempFile("paravec", "l");
    tmpFileL.deleteOnExit();
    Word2Vec w2v = readWord2Vec(file);
    // and "convert" it to ParaVec model + optionally trying to restore labels information
    ParagraphVectors vectors = new ParagraphVectors.Builder(w2v.getConfiguration()).vocabCache(w2v.getVocab()).lookupTable(w2v.getLookupTable()).resetModel(false).build();
    ZipFile zipFile = new ZipFile(file);
    // now we try to restore labels information
    ZipEntry labels = zipFile.getEntry("labels.txt");
    if (labels != null) {
        InputStream stream = zipFile.getInputStream(labels);
        Files.copy(stream, Paths.get(tmpFileL.getAbsolutePath()), StandardCopyOption.REPLACE_EXISTING);
        try (BufferedReader reader = new BufferedReader(new FileReader(tmpFileL))) {
            String line;
            while ((line = reader.readLine()) != null) {
                VocabWord word = vectors.getVocab().tokenFor(decodeB64(line.trim()));
                if (word != null) {
                    word.markAsLabel(true);
                }
            }
        }
    }
    vectors.extractLabels();
    return vectors;
}
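
A short, hypothetical usage sketch of this method; the file path and query word are placeholders, and wordsNearest is the standard DL4J WordVectors API:

import java.io.File;
import java.io.IOException;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.paragraphvectors.ParagraphVectors;

public class ReadParagraphVectorsExample {
    public static void main(String[] args) throws IOException {
        // Placeholder path to a model previously saved with writeParagraphVectors()
        File modelFile = new File("/path/to/paravec-model.zip");
        ParagraphVectors vectors = WordVectorSerializer.readParagraphVectors(modelFile);
        // Entries restored from labels.txt (if present) are marked as labels in the vocab
        System.out.println(vectors.wordsNearest("finance", 5));
    }
}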

Example 18 with ZipEntry

use of java.util.zip.ZipEntry in project deeplearning4j by deeplearning4j.

the class WordVectorSerializer method loadStaticModel.

/**
     * This method restores a previously saved w2v model. The file can be in one of the following formats:
     * 1) Binary model, either compressed or not, like the well-known Google model
     * 2) Popular CSV word2vec text format
     * 3) DL4j compressed format
     *
     * In return you get a StaticWord2Vec model, which can be used as a lookup table only, e.g. in a multi-GPU environment.
     *
     * @param file File should point to a previously saved w2v model
     * @return StaticWord2Vec model exposed through the WordVectors interface
     */
// TODO: this method needs better name :)
public static WordVectors loadStaticModel(File file) {
    if (!file.exists() || file.isDirectory())
        throw new RuntimeException(new FileNotFoundException("File [" + file.getAbsolutePath() + "] was not found"));
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    if (originalPeriodic)
        Nd4j.getMemoryManager().togglePeriodicGc(false);
    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);
    CompressedRamStorage<Integer> storage = new CompressedRamStorage.Builder<Integer>().useInplaceCompression(false).setCompressor(new NoOp()).emulateIsAbsent(false).build();
    VocabCache<VocabWord> vocabCache = new AbstractCache.Builder<VocabWord>().build();
    // if zip - that's dl4j format
    try {
        log.debug("Trying DL4j format...");
        File tmpFileSyn0 = File.createTempFile("word2vec", "syn");
        ZipFile zipFile = new ZipFile(file);
        ZipEntry syn0 = zipFile.getEntry("syn0.txt");
        InputStream stream = zipFile.getInputStream(syn0);
        Files.copy(stream, Paths.get(tmpFileSyn0.getAbsolutePath()), StandardCopyOption.REPLACE_EXISTING);
        storage.clear();
        try (Reader reader = new CSVReader(tmpFileSyn0)) {
            while (reader.hasNext()) {
                Pair<VocabWord, float[]> pair = reader.next();
                VocabWord word = pair.getFirst();
                storage.store(word.getIndex(), pair.getSecond());
                vocabCache.addToken(word);
                vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
                Nd4j.getMemoryManager().invokeGcOccasionally();
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            if (originalPeriodic)
                Nd4j.getMemoryManager().togglePeriodicGc(true);
            Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
        }
    } catch (Exception e) {
        // not a DL4j zip archive; fall back to other formats
        try {
            // try to load file as text csv
            vocabCache = new AbstractCache.Builder<VocabWord>().build();
            storage.clear();
            log.debug("Trying CSVReader...");
            try (Reader reader = new CSVReader(file)) {
                while (reader.hasNext()) {
                    Pair<VocabWord, float[]> pair = reader.next();
                    VocabWord word = pair.getFirst();
                    storage.store(word.getIndex(), pair.getSecond());
                    vocabCache.addToken(word);
                    vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
                    Nd4j.getMemoryManager().invokeGcOccasionally();
                }
            } catch (Exception ef) {
                // this exception is swallowed by the outer catch, and we try to load the data as a binary model instead
                throw new RuntimeException(ef);
            } finally {
                if (originalPeriodic)
                    Nd4j.getMemoryManager().togglePeriodicGc(true);
                Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
            }
        } catch (Exception ex) {
            // otherwise it's probably a Google model, which might be compressed or not
            log.debug("Trying BinaryReader...");
            vocabCache = new AbstractCache.Builder<VocabWord>().build();
            storage.clear();
            try (Reader reader = new BinaryReader(file)) {
                while (reader.hasNext()) {
                    Pair<VocabWord, float[]> pair = reader.next();
                    VocabWord word = pair.getFirst();
                    storage.store(word.getIndex(), pair.getSecond());
                    vocabCache.addToken(word);
                    vocabCache.addWordToIndex(word.getIndex(), word.getLabel());
                    Nd4j.getMemoryManager().invokeGcOccasionally();
                }
            } catch (Exception ez) {
                throw new RuntimeException("Unable to guess input file format");
            } finally {
                if (originalPeriodic)
                    Nd4j.getMemoryManager().togglePeriodicGc(true);
                Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
            }
        } finally {
            if (originalPeriodic)
                Nd4j.getMemoryManager().togglePeriodicGc(true);
            Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
        }
    }
    StaticWord2Vec word2Vec = new StaticWord2Vec.Builder(storage, vocabCache).build();
    return word2Vec;
}
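
As a rough usage sketch (paths are placeholders; getWordVector comes from the WordVectors interface returned above):

import java.io.File;
import java.util.Arrays;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;

public class LoadStaticModelExample {
    public static void main(String[] args) {
        // Placeholder path; Google binary, CSV text, and DL4j zip formats are all accepted
        File modelFile = new File("/path/to/word2vec-model.bin.gz");
        // Returns a lookup-table-only StaticWord2Vec behind the WordVectors interface
        WordVectors vectors = WordVectorSerializer.loadStaticModel(modelFile);
        System.out.println(Arrays.toString(vectors.getWordVector("day")));
    }
}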

Example 19 with ZipEntry

use of java.util.zip.ZipEntry in project druid by druid-io.

the class JobHelper method createNewZipEntry.

private static void createNewZipEntry(ZipOutputStream out, File file) throws IOException {
    log.info("Creating new ZipEntry[%s]", file.getName());
    out.putNextEntry(new ZipEntry(file.getName()));
}
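
For context, a helper like this only writes the entry header; the caller still has to copy the file bytes and close the entry. Below is a generic, self-contained sketch of that pattern using plain java.util.zip (the class and file names are illustrative, not Druid's actual code):

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

public class ZipWriteExample {
    // Writes one file into an open ZipOutputStream: entry header, bytes, then closeEntry()
    static void addFileToZip(ZipOutputStream out, File file) throws IOException {
        out.putNextEntry(new ZipEntry(file.getName()));
        byte[] buffer = new byte[8192];
        try (InputStream in = new FileInputStream(file)) {
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
        out.closeEntry();
    }

    public static void main(String[] args) throws IOException {
        // Illustrative paths only
        try (ZipOutputStream out = new ZipOutputStream(new FileOutputStream("archive.zip"))) {
            addFileToZip(out, new File("descriptor.json"));
        }
    }
}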

Example 20 with ZipEntry

use of java.util.zip.ZipEntry in project cogtool by cogtool.

the class ZipUtil method unzip.

/**
   * Unzips a zipfile to a destination directory.
   * @param zip the file to unzip
   * @param fileDir the destination directory for zipfile contents
   * @throws FileNotFoundException
   * @throws IOException
   */
public static void unzip(ZipFile zip, File fileDir) throws FileNotFoundException, IOException {
    // Read out all entries from ZipFile via input streams
    for (Enumeration<? extends ZipEntry> en = zip.entries(); en.hasMoreElements(); ) {
        ZipEntry ze = en.nextElement();
        // Get info from file entry
        long size = ze.getSize();
        // Create File in fileDir for unpacked entry
        String name = ze.getName();
        //      System.out.println("Unzipping: " + name);
        File zeFile = new File(fileDir, name);
        // Check for a trailing slash to see if this is a directory entry
        if (name.charAt(name.length() - 1) == '/') {
            // If this entry is a directory, make it
            zeFile.mkdirs();
        } else {
            // if this entry is a file, make its parent directories, then it
            zeFile.getParentFile().mkdirs();
            zeFile.createNewFile();
            // Create a FileOutputStream plus a buffered OutputStream to the new file
            FileOutputStream fout = null;
            OutputStream out = null;
            // Get an InputStream from the ZipFile for reading the entry's data
            InputStream zin = null;
            try {
                fout = new FileOutputStream(zeFile);
                out = new BufferedOutputStream(fout);
                zin = zip.getInputStream(ze);
                // Set modification time
                zeFile.setLastModified(ze.getTime());
                // Copy data from zin to out, 100k at a time
                int chunkSize = 100 * 1024;
                byte[] buff = new byte[chunkSize];
                int len = chunkSize;
                for (; size > 0; size -= len) {
                    if (size < chunkSize) {
                        len = (int) size;
                    } else {
                        len = chunkSize;
                    }
                    int actualBytes = 0;
                    int off = 0;
                    do {
                        actualBytes = zin.read(buff, off, len);
                        if (actualBytes == -1) {
                            out.write(buff, off, len);
                            //                System.out.print("!" + len + ':' + actualBytes + ':' + off + ' ');
                            throw new RuntimeException("Bad math in unzip!");
                        } else {
                            out.write(buff, off, actualBytes);
                        //                System.out.print("" + len + ':' + actualBytes + ':' + off + ' ');
                        }
                        len -= actualBytes;
                        size -= actualBytes;
                        off += actualBytes;
                    } while ((len > 0));
                }
            } finally {
                // Close the streams
                if (fout != null) {
                    if (out != null) {
                        if (zin != null) {
                            zin.close();
                        }
                        out.close();
                    }
                    fout.close();
                }
            }
        }
    }
}
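
A brief usage sketch, assuming the ZipUtil class above is available on the classpath; the archive and destination paths are placeholders:

import java.io.File;
import java.io.IOException;
import java.util.zip.ZipFile;

public class UnzipExample {
    public static void main(String[] args) throws IOException {
        File destination = new File("/tmp/unzipped");   // placeholder destination directory
        destination.mkdirs();
        // ZipFile is Closeable, so try-with-resources releases it automatically
        try (ZipFile zip = new ZipFile(new File("/tmp/archive.zip"))) {   // placeholder archive
            ZipUtil.unzip(zip, destination);
        }
    }
}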
