
Example 11 with GZIPInputStream

Use of java.util.zip.GZIPInputStream in project druid by druid-io.

The class WikipediaIrcDecoder, method downloadGeoLiteDbToFile:

private void downloadGeoLiteDbToFile(File geoDb) {
    if (geoDb.exists()) {
        return;
    }
    try {
        log.info("Downloading geo ip database to [%s]. This may take a few minutes.", geoDb.getAbsolutePath());
        File tmpFile = File.createTempFile("druid", "geo");
        // Decompress the gzipped GeoLite2 database on the fly while copying it to the temp file.
        FileUtils.copyInputStreamToFile(new GZIPInputStream(new URL("http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz").openStream()), tmpFile);
        if (!tmpFile.renameTo(geoDb)) {
            throw new RuntimeException("Unable to move geo file to [" + geoDb.getAbsolutePath() + "]!");
        }
    } catch (IOException e) {
        throw new RuntimeException("Unable to download geo ip database.", e);
    }
}
Also used: GZIPInputStream(java.util.zip.GZIPInputStream) IOException(java.io.IOException) File(java.io.File) URL(java.net.URL)
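The snippet above leans on Apache Commons IO (FileUtils.copyInputStreamToFile), which closes the stream it is handed. A minimal JDK-only sketch of the same download-and-decompress idea, with hypothetical class and method names, might look like this:

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.zip.GZIPInputStream;

public class GzipDownloadSketch {
    // Downloads a gzipped resource and writes the decompressed bytes to target.
    static void downloadAndGunzip(URL source, Path target) throws IOException {
        // GZIPInputStream decompresses on the fly while Files.copy streams the result to disk.
        try (InputStream in = new GZIPInputStream(source.openStream())) {
            Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}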

Example 12 with GZIPInputStream

Use of java.util.zip.GZIPInputStream in project druid by druid-io.

The class CompressionUtils, method gunzip:

/**
   * gunzip from the source stream to the destination stream.
   *
   * @param in  The input stream which is to be decompressed. This stream is closed.
   * @param out The output stream to write to. This stream is closed.
   *
   * @return The number of bytes written to the output stream.
   *
   * @throws IOException if an I/O error occurs while reading, writing, or closing the streams
   */
public static long gunzip(InputStream in, OutputStream out) throws IOException {
    try (GZIPInputStream gzipInputStream = gzipInputStream(in)) {
        final long result = ByteStreams.copy(gzipInputStream, out);
        out.flush();
        return result;
    } finally {
        out.close();
    }
}
Also used: GZIPInputStream(java.util.zip.GZIPInputStream)
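A hedged usage sketch: assuming a stream-to-stream gunzip like the method above, decompressing a .gz file to another file can be written with only the JDK. On Java 9+, InputStream.transferTo plays the role of Guava's ByteStreams.copy. The names here are illustrative, not druid's API:

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.zip.GZIPInputStream;

public class GunzipUsageSketch {
    // Decompresses the gzip file at gzPath into targetPath and returns the bytes written.
    static long gunzipToFile(String gzPath, String targetPath) throws IOException {
        try (InputStream in = new GZIPInputStream(new FileInputStream(gzPath));
             OutputStream out = new FileOutputStream(targetPath)) {
            // transferTo (Java 9+) copies until end of stream, like ByteStreams.copy above.
            return in.transferTo(out);
        }
    }
}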

Example 13 with GZIPInputStream

Use of java.util.zip.GZIPInputStream in project druid by druid-io.

The class CompressionUtilsTest, method testGoodGZStream:

@Test
public void testGoodGZStream() throws IOException {
    final File tmpDir = temporaryFolder.newFolder("testGoodGZStream");
    final File gzFile = new File(tmpDir, testFile.getName() + ".gz");
    Assert.assertFalse(gzFile.exists());
    CompressionUtils.gzip(new FileInputStream(testFile), new FileOutputStream(gzFile));
    Assert.assertTrue(gzFile.exists());
    try (final InputStream inputStream = new GZIPInputStream(new FileInputStream(gzFile))) {
        assertGoodDataStream(inputStream);
    }
    if (!testFile.delete()) {
        throw new IOException(String.format("Unable to delete file [%s]", testFile.getAbsolutePath()));
    }
    Assert.assertFalse(testFile.exists());
    CompressionUtils.gunzip(new FileInputStream(gzFile), testFile);
    Assert.assertTrue(testFile.exists());
    try (final InputStream inputStream = new FileInputStream(testFile)) {
        assertGoodDataStream(inputStream);
    }
}
Also used: GZIPInputStream(java.util.zip.GZIPInputStream) FilterInputStream(java.io.FilterInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) File(java.io.File) Test(org.junit.Test)
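The test drives druid's CompressionUtils; the same round trip (compress, then decompress and verify) can be sketched with plain JDK streams. This is an illustrative in-memory variant, not the project's test:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class GzipRoundTripSketch {
    public static void main(String[] args) throws IOException {
        byte[] original = "some test payload".getBytes(StandardCharsets.UTF_8);

        // Compress into an in-memory buffer; closing the GZIPOutputStream writes the gzip trailer.
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        try (GZIPOutputStream gz = new GZIPOutputStream(compressed)) {
            gz.write(original);
        }

        // Decompress and verify the round trip.
        try (GZIPInputStream in = new GZIPInputStream(new ByteArrayInputStream(compressed.toByteArray()))) {
            byte[] restored = in.readAllBytes();
            if (!new String(restored, StandardCharsets.UTF_8).equals("some test payload")) {
                throw new AssertionError("gzip round trip lost data");
            }
        }
    }
}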

Example 14 with GZIPInputStream

Use of java.util.zip.GZIPInputStream in project deeplearning4j by deeplearning4j.

The class WordVectorSerializer, method readBinaryModel:

/**
     * Read a binary word2vec file.
     *
     * @param modelFile
     *            the File to read
     * @param linebreaks
     *            if true, the reader expects each word/vector to be in a separate line, terminated
     *            by a line break
     * @param normalize
     *            if true, each vector is normalized to unit length as it is read
     * @return a {@link Word2Vec} model
     * @throws NumberFormatException
     * @throws IOException
     * @throws FileNotFoundException
     */
private static Word2Vec readBinaryModel(File modelFile, boolean linebreaks, boolean normalize) throws NumberFormatException, IOException {
    InMemoryLookupTable<VocabWord> lookupTable;
    VocabCache<VocabWord> cache;
    INDArray syn0;
    int words, size;
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    if (originalPeriodic)
        Nd4j.getMemoryManager().togglePeriodicGc(false);
    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);
    try (BufferedInputStream bis = new BufferedInputStream(GzipUtils.isCompressedFilename(modelFile.getName()) ? new GZIPInputStream(new FileInputStream(modelFile)) : new FileInputStream(modelFile));
        DataInputStream dis = new DataInputStream(bis)) {
        words = Integer.parseInt(readString(dis));
        size = Integer.parseInt(readString(dis));
        syn0 = Nd4j.create(words, size);
        cache = new AbstractCache<>();
        printOutProjectedMemoryUse(words, size, 1);
        lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>().cache(cache).useHierarchicSoftmax(false).vectorLength(size).build();
        String word;
        float[] vector = new float[size];
        for (int i = 0; i < words; i++) {
            word = readString(dis);
            log.trace("Loading " + word + " with word " + i);
            for (int j = 0; j < size; j++) {
                vector[j] = readFloat(dis);
            }
            syn0.putRow(i, normalize ? Transforms.unitVec(Nd4j.create(vector)) : Nd4j.create(vector));
            VocabWord vw = new VocabWord(1.0, word);
            vw.setIndex(cache.numWords());
            cache.addToken(vw);
            cache.addWordToIndex(vw.getIndex(), vw.getLabel());
            cache.putVocabWord(word);
            if (linebreaks) {
                // line break
                dis.readByte();
            }
            Nd4j.getMemoryManager().invokeGcOccasionally();
        }
    } finally {
        if (originalPeriodic)
            Nd4j.getMemoryManager().togglePeriodicGc(true);
        Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
    }
    lookupTable.setSyn0(syn0);
    Word2Vec ret = new Word2Vec.Builder().useHierarchicSoftmax(false).resetModel(false).layerSize(syn0.columns()).allowParallelTokenization(true).elementsLearningAlgorithm(new SkipGram<VocabWord>()).learningRate(0.025).windowSize(5).workers(1).build();
    ret.setVocab(cache);
    ret.setLookupTable(lookupTable);
    return ret;
}
Also used: VocabWord(org.deeplearning4j.models.word2vec.VocabWord) GZIPInputStream(java.util.zip.GZIPInputStream) INDArray(org.nd4j.linalg.api.ndarray.INDArray) StaticWord2Vec(org.deeplearning4j.models.word2vec.StaticWord2Vec) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec)
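The part of readBinaryModel that involves GZIPInputStream is the conditional wrapping keyed off the file name (GzipUtils.isCompressedFilename comes from Apache Commons Compress). A standalone sketch of that stream setup, with a hypothetical class name, a plain ".gz" extension check in place of GzipUtils, and the ND4J-specific parsing omitted:

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

public class MaybeGzippedBinarySketch {
    // Opens a DataInputStream over the file, transparently gunzipping *.gz inputs.
    static DataInputStream openBinary(File modelFile) throws IOException {
        InputStream raw = new FileInputStream(modelFile);
        InputStream maybeGunzipped = modelFile.getName().endsWith(".gz")
                ? new GZIPInputStream(raw)
                : raw;
        // Buffering reduces per-byte read overhead for the DataInputStream on top.
        return new DataInputStream(new BufferedInputStream(maybeGunzipped));
    }
}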

Example 15 with GZIPInputStream

Use of java.util.zip.GZIPInputStream in project deeplearning4j by deeplearning4j.

The class WordVectorSerializer, method readTextModel:

/**
     * Read a text-format word2vec model file; gzipped input is detected by file extension.
     *
     * @param modelFile the model file to read
     * @return the loaded {@link Word2Vec} model
     * @throws FileNotFoundException
     * @throws IOException
     * @throws NumberFormatException
     */
private static Word2Vec readTextModel(File modelFile) throws IOException, NumberFormatException {
    InMemoryLookupTable lookupTable;
    VocabCache cache;
    INDArray syn0;
    Word2Vec ret = new Word2Vec();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(GzipUtils.isCompressedFilename(modelFile.getName()) ? new GZIPInputStream(new FileInputStream(modelFile)) : new FileInputStream(modelFile), "UTF-8"))) {
        String line = reader.readLine();
        String[] initial = line.split(" ");
        int words = Integer.parseInt(initial[0]);
        int layerSize = Integer.parseInt(initial[1]);
        syn0 = Nd4j.create(words, layerSize);
        cache = new InMemoryLookupCache(false);
        int currLine = 0;
        while ((line = reader.readLine()) != null) {
            String[] split = line.split(" ");
            assert split.length == layerSize + 1;
            String word = split[0].replaceAll(whitespaceReplacement, " ");
            float[] vector = new float[split.length - 1];
            for (int i = 1; i < split.length; i++) {
                vector[i - 1] = Float.parseFloat(split[i]);
            }
            syn0.putRow(currLine, Nd4j.create(vector));
            cache.addWordToIndex(cache.numWords(), word);
            cache.addToken(new VocabWord(1, word));
            cache.putVocabWord(word);
            currLine++;
        }
        lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder().cache(cache).vectorLength(layerSize).build();
        lookupTable.setSyn0(syn0);
        ret.setVocab(cache);
        ret.setLookupTable(lookupTable);
    }
    return ret;
}
Also used: VocabWord(org.deeplearning4j.models.word2vec.VocabWord) InMemoryLookupCache(org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache) GZIPInputStream(java.util.zip.GZIPInputStream) InMemoryLookupTable(org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable) INDArray(org.nd4j.linalg.api.ndarray.INDArray) VocabCache(org.deeplearning4j.models.word2vec.wordstore.VocabCache) StaticWord2Vec(org.deeplearning4j.models.word2vec.StaticWord2Vec) Word2Vec(org.deeplearning4j.models.word2vec.Word2Vec)
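The same conditional-gunzip idea, applied to line-oriented text as in readTextModel, can be sketched with a BufferedReader. Class and method names are hypothetical, and the word2vec parsing is left out:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;

public class MaybeGzippedTextSketch {
    // Reads a possibly gzipped UTF-8 text file and returns its line count.
    static long countLines(File file) throws IOException {
        InputStream raw = new FileInputStream(file);
        InputStream in = file.getName().endsWith(".gz") ? new GZIPInputStream(raw) : raw;
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            long lines = 0;
            while (reader.readLine() != null) {
                lines++;
            }
            return lines;
        }
    }
}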

Aggregations

GZIPInputStream (java.util.zip.GZIPInputStream): 376
InputStream (java.io.InputStream): 144
IOException (java.io.IOException): 125
ByteArrayInputStream (java.io.ByteArrayInputStream): 120
FileInputStream (java.io.FileInputStream): 98
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 77
InputStreamReader (java.io.InputStreamReader): 57
File (java.io.File): 56
BufferedReader (java.io.BufferedReader): 45
BufferedInputStream (java.io.BufferedInputStream): 41
Test (org.junit.Test): 41
FileOutputStream (java.io.FileOutputStream): 30
URL (java.net.URL): 25
InflaterInputStream (java.util.zip.InflaterInputStream): 25
OutputStream (java.io.OutputStream): 24
GZIPOutputStream (java.util.zip.GZIPOutputStream): 21
ObjectInputStream (java.io.ObjectInputStream): 19
HttpURLConnection (java.net.HttpURLConnection): 19
URLConnection (java.net.URLConnection): 17
HashMap (java.util.HashMap): 15