Use of org.apache.commons.compress.compressors.CompressorStreamFactory in the languagetool project (languagetool-org):
the indexInputFile method of class CommonCrawlToNgram3.
/**
 * Reads the compressed input file, indexes its contents line by line, and
 * writes the accumulated unigram/bigram/trigram counts to disk.
 *
 * @throws IOException on read/write failure
 * @throws CompressorException if the compression format cannot be detected
 */
private void indexInputFile() throws IOException, CompressorException {
    // All three streams are resources so nothing leaks if
    // createCompressorInputStream throws CompressorException (the original
    // opened fin/in outside the try and lost them on that path). The local
    // is named 'cis' to avoid shadowing the 'input' field used above.
    try (FileInputStream fin = new FileInputStream(input);
         BufferedInputStream in = new BufferedInputStream(fin);
         CompressorInputStream cis = new CompressorStreamFactory().createCompressorInputStream(in)) {
        final byte[] buffer = new byte[8192];
        int n;
        while ((n = cis.read(buffer)) != -1) {
            // TODO: not always correct, we need to wait for line end first?
            // A multi-byte character or a line can be split across reads.
            // NOTE(review): uses the platform default charset — presumably
            // should be UTF-8 for CommonCrawl data; confirm before changing.
            String buf = new String(buffer, 0, n);
            String[] lines = buf.split("\n");
            indexLine(lines);
        }
    }
    writeToDisk(1, unigramToCount);
    writeToDisk(2, bigramToCount);
    writeToDisk(3, trigramToCount);
}
Use of org.apache.commons.compress.compressors.CompressorStreamFactory in the cloudstack project (apache):
the checkCompressed method of class VhdProcessor.
/**
 * Probes whether the given file is in a compression format recognized by
 * commons-compress.
 *
 * @param fileName path of the file to probe
 * @return {@code true} if a compressor stream could be opened over the file;
 *         {@code false} if the file is missing or the format is unrecognized
 *         (both cases are logged at WARN and swallowed, matching the
 *         original best-effort contract)
 * @throws IOException if closing the streams fails
 */
private boolean checkCompressed(String fileName) throws IOException {
    // try-with-resources replaces the original hand-rolled finally chain:
    // closing the outermost stream closes the wrapped ones, and every
    // stream is closed on every path (success, unknown format, missing file).
    try (FileInputStream fin = new FileInputStream(fileName);
         BufferedInputStream bin = new BufferedInputStream(fin);
         CompressorInputStream cin = new CompressorStreamFactory().createCompressorInputStream(bin)) {
        return true;
    } catch (CompressorException | FileNotFoundException e) {
        // Best-effort probe: log and report "not compressed" instead of failing.
        s_logger.warn(e.getMessage());
        return false;
    }
}
Use of org.apache.commons.compress.compressors.CompressorStreamFactory in the logging-log4j2 project (apache):
the execute method of class CommonsCompressAction.
/**
 * Compresses a file.
 *
 * @param name the compressor name, i.e. "gz", "bzip2", "xz", "pack200", or "deflate".
 * @param source file to compress, may not be null.
 * @param destination compressed file, may not be null.
 * @param deleteSource if true, attempt to delete file on completion. Failure to delete does not cause an exception
 * to be thrown or affect return value.
 *
 * @return true if source file compressed.
 * @throws IOException on IO exception.
 */
public static boolean execute(final String name, final File source, final File destination, final boolean deleteSource) throws IOException {
    if (!source.exists()) {
        return false;
    }
    LOGGER.debug("Starting {} compression of {}", name, source.getPath());
    // The FileOutputStream is declared as its own resource so it is closed
    // even when createCompressorOutputStream throws CompressorException
    // (previously it was constructed inline and leaked on that path).
    // Closing the BufferedOutputStream closes the compressor stream it wraps.
    try (final FileInputStream input = new FileInputStream(source);
         final FileOutputStream fos = new FileOutputStream(destination);
         final BufferedOutputStream output = new BufferedOutputStream(
                 new CompressorStreamFactory().createCompressorOutputStream(name, fos))) {
        IOUtils.copy(input, output, BUF_SIZE);
        LOGGER.debug("Finished {} compression of {}", name, source.getPath());
    } catch (final CompressorException e) {
        // Preserve the cause; callers only declare IOException.
        throw new IOException(e);
    }
    if (deleteSource) {
        try {
            if (Files.deleteIfExists(source.toPath())) {
                LOGGER.debug("Deleted {}", source.toString());
            } else {
                LOGGER.warn("Unable to delete {} after {} compression. File did not exist", source.toString(), name);
            }
        } catch (Exception ex) {
            // Deletion failure is documented as non-fatal: log and keep returning true.
            LOGGER.warn("Unable to delete {} after {} compression, {}", source.toString(), name, ex.getMessage());
        }
    }
    return true;
}
Use of org.apache.commons.compress.compressors.CompressorStreamFactory in the lucene-solr project (apache):
the rawGzipFile method of class StreamUtilsTest.
/**
 * Creates a gzip-compressed test file named {@code testfile.<ext>} in the
 * test directory and fills it via {@link #writeText}.
 *
 * @param ext file extension to use for the created file
 * @return path of the created file
 */
private Path rawGzipFile(String ext) throws Exception {
    Path f = testDir.resolve("testfile." + ext);
    // Both streams are resources so the gzip trailer is flushed and nothing
    // leaks, even if createCompressorOutputStream throws or writeText does
    // not close the stream (compressor close is safe to call twice).
    try (OutputStream raw = Files.newOutputStream(f);
         OutputStream os = new CompressorStreamFactory()
                 .createCompressorOutputStream(CompressorStreamFactory.GZIP, raw)) {
        writeText(os);
    }
    return f;
}
Use of org.apache.commons.compress.compressors.CompressorStreamFactory in the tika project (apache):
the parse method of class CompressorParser.
/**
 * Decompresses the incoming stream and hands the decompressed payload to the
 * embedded-document extractor, recording the detected compression media type
 * in the metadata.
 *
 * @param stream compressed input; wrapped so the caller's stream is never closed here
 * @param handler receives the XHTML SAX events for the wrapper document
 * @param metadata in/out: resource name is read, detected content type is written
 * @param context supplies CompressorParserOptions and the embedded-document extractor
 * @throws TikaException if the stream cannot be decompressed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
// should not be closed
if (stream.markSupported()) {
stream = new CloseShieldInputStream(stream);
} else {
// Ensure that the stream supports the mark feature
stream = new BufferedInputStream(new CloseShieldInputStream(stream));
}
CompressorInputStream cis;
try {
// Default options: do not decompress concatenated streams unless the
// context supplies an override.
CompressorParserOptions options = context.get(CompressorParserOptions.class, new CompressorParserOptions() {
public boolean decompressConcatenated(Metadata metadata) {
return false;
}
});
CompressorStreamFactory factory = new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
cis = factory.createCompressorInputStream(stream);
} catch (CompressorException e) {
// Surface a decompression memory-cap hit as Tika's dedicated exception type.
if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
throw new TikaMemoryLimitException(e.getMessage());
}
throw new TikaException("Unable to uncompress document stream", e);
}
MediaType type = getMediaType(cis);
// Only record the type when detection produced something more specific
// than the octet-stream fallback.
if (!type.equals(MediaType.OCTET_STREAM)) {
metadata.set(CONTENT_TYPE, type.toString());
}
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try {
Metadata entrydata = new Metadata();
String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
if (name != null) {
// Derive the embedded entry's name by stripping/replacing the
// compression suffix (.tbz/.tbz2 become .tar; others are removed).
if (name.endsWith(".tbz")) {
name = name.substring(0, name.length() - 4) + ".tar";
} else if (name.endsWith(".tbz2")) {
name = name.substring(0, name.length() - 5) + ".tar";
} else if (name.endsWith(".bz")) {
name = name.substring(0, name.length() - 3);
} else if (name.endsWith(".bz2")) {
name = name.substring(0, name.length() - 4);
} else if (name.endsWith(".xz")) {
name = name.substring(0, name.length() - 3);
} else if (name.endsWith(".zlib")) {
name = name.substring(0, name.length() - 5);
} else if (name.endsWith(".pack")) {
name = name.substring(0, name.length() - 5);
} else if (name.length() > 0) {
// Fallback: let commons-compress handle gzip-style suffixes (.gz, .tgz, ...).
name = GzipUtils.getUncompressedFilename(name);
}
entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
}
// Use the delegate parser to parse the compressed document
EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (extractor.shouldParseEmbedded(entrydata)) {
extractor.parseEmbedded(cis, xhtml, entrydata, true);
}
} finally {
// Close the decompressor on every path; the caller's stream stays open
// thanks to the CloseShieldInputStream wrapper above.
cis.close();
}
xhtml.endDocument();
}
Aggregations