Use of java.util.zip.GZIPInputStream in project druid by druid-io:
the class WikipediaIrcDecoder, method downloadGeoLiteDbToFile.
private void downloadGeoLiteDbToFile(File geoDb) {
    if (geoDb.exists()) {
        return;
    }
    try {
        log.info("Downloading geo ip database to [%s]. This may take a few minutes.", geoDb.getAbsolutePath());
        File tmpFile = File.createTempFile("druid", "geo");
        FileUtils.copyInputStreamToFile(
                new GZIPInputStream(
                        new URL("http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.mmdb.gz").openStream()),
                tmpFile);
        if (!tmpFile.renameTo(geoDb)) {
            throw new RuntimeException("Unable to move geo file to [" + geoDb.getAbsolutePath() + "]!");
        }
    } catch (IOException e) {
        throw new RuntimeException("Unable to download geo ip database.", e);
    }
}
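The same pattern works with nothing but the JDK: wrap the remote stream in GZIPInputStream and stream the decompressed bytes to disk. A minimal sketch, assuming a placeholder URL and output path (Commons IO's copyInputStreamToFile is swapped for java.nio.file.Files.copy):

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.zip.GZIPInputStream;

public class GzipDownloadSketch {
    static void downloadAndGunzip(URL gzUrl, Path target) throws IOException {
        // GZIPInputStream decompresses on the fly as Files.copy drains the stream;
        // try-with-resources closes the underlying network stream.
        try (InputStream in = new GZIPInputStream(gzUrl.openStream())) {
            Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }

    public static void main(String[] args) throws IOException {
        // Placeholder URL and file name, not a real endpoint.
        downloadAndGunzip(new URL("https://example.com/data.mmdb.gz"), Paths.get("data.mmdb"));
    }
}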
Use of java.util.zip.GZIPInputStream in project druid by druid-io:
the class CompressionUtils, method gunzip.
/**
 * Gunzip from the source stream to the destination stream.
 *
 * @param in  the input stream to decompress; this stream is closed
 * @param out the output stream to write to; this stream is closed
 *
 * @return the number of bytes written to the output stream
 *
 * @throws IOException if reading, decompressing, or writing fails
 */
public static long gunzip(InputStream in, OutputStream out) throws IOException {
    try (GZIPInputStream gzipInputStream = gzipInputStream(in)) {
        final long result = ByteStreams.copy(gzipInputStream, out);
        out.flush();
        return result;
    } finally {
        out.close();
    }
}
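A usage sketch for the method above. It assumes Druid's CompressionUtils is on the classpath, and the file names are placeholders; note the caller does not need to close either stream, since gunzip closes both:

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

public class GunzipUsageSketch {
    public static void main(String[] args) throws IOException {
        // Both streams are closed by gunzip itself, per its Javadoc.
        long bytesWritten = CompressionUtils.gunzip(
                new FileInputStream("archive.bin.gz"),
                new FileOutputStream("archive.bin"));
        System.out.println("Wrote " + bytesWritten + " bytes");
    }
}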
Use of java.util.zip.GZIPInputStream in project druid by druid-io:
the class CompressionUtilsTest, method testGoodGZStream.
@Test
public void testGoodGZStream() throws IOException {
    final File tmpDir = temporaryFolder.newFolder("testGoodGZStream");
    final File gzFile = new File(tmpDir, testFile.getName() + ".gz");
    Assert.assertFalse(gzFile.exists());
    CompressionUtils.gzip(new FileInputStream(testFile), new FileOutputStream(gzFile));
    Assert.assertTrue(gzFile.exists());
    try (final InputStream inputStream = new GZIPInputStream(new FileInputStream(gzFile))) {
        assertGoodDataStream(inputStream);
    }
    if (!testFile.delete()) {
        throw new IOException(String.format("Unable to delete file [%s]", testFile.getAbsolutePath()));
    }
    Assert.assertFalse(testFile.exists());
    CompressionUtils.gunzip(new FileInputStream(gzFile), testFile);
    Assert.assertTrue(testFile.exists());
    try (final InputStream inputStream = new FileInputStream(testFile)) {
        assertGoodDataStream(inputStream);
    }
}
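The round trip this test exercises (compress, then decompress back to identical bytes) can be reproduced with the JDK alone. A minimal sketch with hypothetical file names:

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class GzipRoundTripSketch {
    public static void main(String[] args) throws IOException {
        Path original = Paths.get("data.txt");           // placeholder input
        Path gz = Paths.get("data.txt.gz");
        Path restored = Paths.get("data-restored.txt");

        // Compress: wrap the file output in GZIPOutputStream.
        try (OutputStream out = new GZIPOutputStream(Files.newOutputStream(gz))) {
            Files.copy(original, out);
        }
        // Decompress: wrap the file input in GZIPInputStream.
        try (InputStream in = new GZIPInputStream(Files.newInputStream(gz))) {
            Files.copy(in, restored, StandardCopyOption.REPLACE_EXISTING);
        }
        // The restored bytes should match the original exactly.
        System.out.println(Arrays.equals(
                Files.readAllBytes(original), Files.readAllBytes(restored)));
    }
}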
Use of java.util.zip.GZIPInputStream in project deeplearning4j by deeplearning4j:
the class WordVectorSerializer, method readBinaryModel.
/**
 * Read a binary word2vec model file.
 *
 * @param modelFile  the File to read
 * @param linebreaks if true, the reader expects each word/vector pair to be on its own line,
 *                   terminated by a line break
 * @param normalize  if true, each vector is normalized to unit length as it is loaded
 * @return a {@link Word2Vec} model
 * @throws NumberFormatException if the header cannot be parsed
 * @throws IOException if the file cannot be read
 * @throws FileNotFoundException if the file does not exist
 */
private static Word2Vec readBinaryModel(File modelFile, boolean linebreaks, boolean normalize)
        throws NumberFormatException, IOException {
    InMemoryLookupTable<VocabWord> lookupTable;
    VocabCache<VocabWord> cache;
    INDArray syn0;
    int words, size;
    int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency();
    boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive();
    if (originalPeriodic)
        Nd4j.getMemoryManager().togglePeriodicGc(false);
    Nd4j.getMemoryManager().setOccasionalGcFrequency(50000);
    try (BufferedInputStream bis = new BufferedInputStream(GzipUtils.isCompressedFilename(modelFile.getName())
            ? new GZIPInputStream(new FileInputStream(modelFile))
            : new FileInputStream(modelFile));
            DataInputStream dis = new DataInputStream(bis)) {
        words = Integer.parseInt(readString(dis));
        size = Integer.parseInt(readString(dis));
        syn0 = Nd4j.create(words, size);
        cache = new AbstractCache<>();
        printOutProjectedMemoryUse(words, size, 1);
        lookupTable = (InMemoryLookupTable<VocabWord>) new InMemoryLookupTable.Builder<VocabWord>()
                .cache(cache)
                .useHierarchicSoftmax(false)
                .vectorLength(size)
                .build();
        String word;
        float[] vector = new float[size];
        for (int i = 0; i < words; i++) {
            word = readString(dis);
            log.trace("Loading " + word + " with word " + i);
            for (int j = 0; j < size; j++) {
                vector[j] = readFloat(dis);
            }
            syn0.putRow(i, normalize ? Transforms.unitVec(Nd4j.create(vector)) : Nd4j.create(vector));
            VocabWord vw = new VocabWord(1.0, word);
            vw.setIndex(cache.numWords());
            cache.addToken(vw);
            cache.addWordToIndex(vw.getIndex(), vw.getLabel());
            cache.putVocabWord(word);
            if (linebreaks) {
                // line break
                dis.readByte();
            }
            Nd4j.getMemoryManager().invokeGcOccasionally();
        }
    } finally {
        if (originalPeriodic)
            Nd4j.getMemoryManager().togglePeriodicGc(true);
        Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq);
    }
    lookupTable.setSyn0(syn0);
    Word2Vec ret = new Word2Vec.Builder()
            .useHierarchicSoftmax(false)
            .resetModel(false)
            .layerSize(syn0.columns())
            .allowParallelTokenization(true)
            .elementsLearningAlgorithm(new SkipGram<VocabWord>())
            .learningRate(0.025)
            .windowSize(5)
            .workers(1)
            .build();
    ret.setVocab(cache);
    ret.setLookupTable(lookupTable);
    return ret;
}
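readString and readFloat are private helpers of WordVectorSerializer not shown here. The original word2vec C binary format writes whitespace-terminated tokens followed by little-endian IEEE-754 floats, so plausible implementations look roughly like this sketch (an illustration of the format, not DL4J's exact code; the ASCII-only string handling is a simplification):

import java.io.DataInputStream;
import java.io.IOException;

final class Word2VecBinaryFormatSketch {
    // Read bytes up to the next space or newline; word2vec writes tokens this way.
    static String readString(DataInputStream dis) throws IOException {
        StringBuilder sb = new StringBuilder();
        int b;
        while ((b = dis.read()) != -1 && b != ' ' && b != '\n') {
            sb.append((char) b);
        }
        return sb.toString();
    }

    // word2vec stores floats little-endian, but DataInputStream.readFloat is
    // big-endian, so assemble the int by hand before converting.
    static float readFloat(DataInputStream dis) throws IOException {
        byte[] bytes = new byte[4];
        dis.readFully(bytes);
        int bits = (bytes[0] & 0xFF)
                | ((bytes[1] & 0xFF) << 8)
                | ((bytes[2] & 0xFF) << 16)
                | ((bytes[3] & 0xFF) << 24);
        return Float.intBitsToFloat(bits);
    }
}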
Use of java.util.zip.GZIPInputStream in project deeplearning4j by deeplearning4j:
the class WordVectorSerializer, method readTextModel.
/**
 * Read a text-format word2vec model file.
 *
 * @param modelFile the File to read
 * @return a {@link Word2Vec} model
 * @throws FileNotFoundException if the file does not exist
 * @throws IOException if the file cannot be read
 * @throws NumberFormatException if the header or a vector value cannot be parsed
 */
private static Word2Vec readTextModel(File modelFile) throws IOException, NumberFormatException {
    InMemoryLookupTable lookupTable;
    VocabCache cache;
    INDArray syn0;
    Word2Vec ret = new Word2Vec();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            GzipUtils.isCompressedFilename(modelFile.getName())
                    ? new GZIPInputStream(new FileInputStream(modelFile))
                    : new FileInputStream(modelFile), "UTF-8"))) {
        String line = reader.readLine();
        String[] initial = line.split(" ");
        int words = Integer.parseInt(initial[0]);
        int layerSize = Integer.parseInt(initial[1]);
        syn0 = Nd4j.create(words, layerSize);
        cache = new InMemoryLookupCache(false);
        int currLine = 0;
        while ((line = reader.readLine()) != null) {
            String[] split = line.split(" ");
            assert split.length == layerSize + 1;
            String word = split[0].replaceAll(whitespaceReplacement, " ");
            float[] vector = new float[split.length - 1];
            for (int i = 1; i < split.length; i++) {
                vector[i - 1] = Float.parseFloat(split[i]);
            }
            syn0.putRow(currLine, Nd4j.create(vector));
            cache.addWordToIndex(cache.numWords(), word);
            cache.addToken(new VocabWord(1, word));
            cache.putVocabWord(word);
            currLine++;
        }
        lookupTable = (InMemoryLookupTable) new InMemoryLookupTable.Builder()
                .cache(cache)
                .vectorLength(layerSize)
                .build();
        lookupTable.setSyn0(syn0);
        ret.setVocab(cache);
        ret.setLookupTable(lookupTable);
    }
    return ret;
}
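The text format itself is easy to parse without DL4J: a header line with the vocabulary size and layer size, then one word and its space-separated vector per line. A stand-alone sketch that loads the vectors into a plain map (file path hypothetical; the whitespaceReplacement unescaping from the snippet above is omitted):

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedHashMap;
import java.util.Map;

public class TextModelReaderSketch {
    static Map<String, float[]> read(Path modelFile) throws IOException {
        try (BufferedReader reader = Files.newBufferedReader(modelFile, StandardCharsets.UTF_8)) {
            // Header line: "<vocab size> <layer size>"
            String[] header = reader.readLine().split(" ");
            int words = Integer.parseInt(header[0]);
            int layerSize = Integer.parseInt(header[1]);
            Map<String, float[]> vectors = new LinkedHashMap<>(words);
            String line;
            while ((line = reader.readLine()) != null) {
                // Each line: "<word> <v1> <v2> ... <vN>"
                String[] split = line.split(" ");
                float[] vector = new float[layerSize];
                for (int i = 1; i <= layerSize; i++) {
                    vector[i - 1] = Float.parseFloat(split[i]);
                }
                vectors.put(split[0], vector);
            }
            return vectors;
        }
    }

    public static void main(String[] args) throws IOException {
        // Placeholder path to a text-format word2vec model.
        System.out.println(read(Paths.get("vectors.txt")).size() + " words loaded");
    }
}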