Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project apex-malhar by apache.
The class HdfsTestSource, method start.
@Override
public void start() {
    super.start();
    emitTimer = new Timer();
    final ChannelProcessor channelProcessor = getChannelProcessor();
    emitTimer.scheduleAtFixedRate(new TimerTask() {
        @Override
        public void run() {
            int lineCount = 0;
            events.clear();
            try {
                while (lineCount < rate && !finished) {
                    String line = br.readLine();
                    if (line == null) {
                        logger.debug("completed file {}", currentFile);
                        br.close();
                        currentFile++;
                        if (currentFile == dataFiles.size()) {
                            logger.info("finished all files");
                            finished = true;
                            break;
                        }
                        Path filePath = new Path(dataFiles.get(currentFile));
                        br = new BufferedReader(new InputStreamReader(new GzipCompressorInputStream(fs.open(filePath))));
                        logger.info("opening file {}. {}", currentFile, filePath);
                        continue;
                    }
                    lineCount++;
                    Event flumeEvent = EventBuilder.withBody(line.getBytes());
                    events.add(flumeEvent);
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (events.size() > 0) {
                channelProcessor.processEventBatch(events);
            }
            if (finished) {
                emitTimer.cancel();
            }
        }
    }, 0, 1000);
}
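The core pattern above (open the file, wrap the raw stream in a GzipCompressorInputStream, and read lines through a BufferedReader) also works outside the Flume/HDFS context. Below is a minimal standalone sketch of that pattern; the local file name data.txt.gz and the explicit UTF-8 charset are illustrative assumptions, not part of the apex-malhar source.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class GzipLineReader {
    public static void main(String[] args) throws IOException {
        // hypothetical local file standing in for the HDFS path used above
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new GzipCompressorInputStream(Files.newInputStream(Paths.get("data.txt.gz"))),
                StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line); // each line arrives already decompressed
            }
        }
    }
}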
Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project deeplearning4j by deeplearning4j.
The class ArchiveUtils, method unzipFileTo.
/**
 * Extracts the archive's files to the specified destination directory.
 *
 * @param file the archive file to extract
 * @param dest the destination directory
 * @throws IOException if the archive cannot be read or the files cannot be written
 */
public static void unzipFileTo(String file, String dest) throws IOException {
    File target = new File(file);
    if (!target.exists())
        throw new IllegalArgumentException("Archive doesn't exist");
    FileInputStream fin = new FileInputStream(target);
    int BUFFER = 2048;
    byte[] data = new byte[BUFFER];
    if (file.endsWith(".zip")) {
        // get the zip file content
        ZipInputStream zis = new ZipInputStream(fin);
        // get the zipped file list entry
        ZipEntry ze = zis.getNextEntry();
        while (ze != null) {
            String fileName = ze.getName();
            File newFile = new File(dest + File.separator + fileName);
            log.info("file unzip : " + newFile.getAbsoluteFile());
            // create any missing parent folders first, otherwise a
            // FileNotFoundException is thrown for entries inside folders
            new File(newFile.getParent()).mkdirs();
            FileOutputStream fos = new FileOutputStream(newFile);
            int len;
            while ((len = zis.read(data)) > 0) {
                fos.write(data, 0, len);
            }
            fos.close();
            ze = zis.getNextEntry();
        }
        zis.closeEntry();
        zis.close();
    } else if (file.endsWith(".tar.gz") || file.endsWith(".tgz")) {
        BufferedInputStream in = new BufferedInputStream(fin);
        GzipCompressorInputStream gzIn = new GzipCompressorInputStream(in);
        TarArchiveInputStream tarIn = new TarArchiveInputStream(gzIn);
        TarArchiveEntry entry = null;
        while ((entry = (TarArchiveEntry) tarIn.getNextEntry()) != null) {
            log.info("Extracting: " + entry.getName());
            if (entry.isDirectory()) {
                File f = new File(dest + File.separator + entry.getName());
                f.mkdirs();
            } else {
                // the entry is a file: write the decompressed bytes to disk
                // and close the destination stream
                int count;
                FileOutputStream fos = new FileOutputStream(dest + File.separator + entry.getName());
                BufferedOutputStream destStream = new BufferedOutputStream(fos, BUFFER);
                while ((count = tarIn.read(data, 0, BUFFER)) != -1) {
                    destStream.write(data, 0, count);
                }
                destStream.flush();
                IOUtils.closeQuietly(destStream);
            }
        }
        // close the input stream
        tarIn.close();
    } else if (file.endsWith(".gz")) {
        GZIPInputStream is2 = new GZIPInputStream(fin);
        File extracted = new File(target.getParent(), target.getName().replace(".gz", ""));
        if (extracted.exists())
            extracted.delete();
        extracted.createNewFile();
        OutputStream fos = FileUtils.openOutputStream(extracted);
        IOUtils.copyLarge(is2, fos);
        is2.close();
        fos.flush();
        fos.close();
    }
    target.delete();
}
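The .tar.gz branch chains three streams: file input, gzip decompression, then the tar reader. A compact sketch of the same chain using try-with-resources, which closes all three layers automatically, is shown below. The class name and paths are illustrative, and a production extractor should also validate entry names against path traversal (zip-slip), which this sketch omits.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class TarGzExtractor {
    public static void extract(Path archive, Path destDir) throws IOException {
        // closing tarIn closes the gzip and file streams beneath it
        try (TarArchiveInputStream tarIn = new TarArchiveInputStream(
                new GzipCompressorInputStream(
                        new BufferedInputStream(Files.newInputStream(archive))))) {
            TarArchiveEntry entry;
            while ((entry = tarIn.getNextTarEntry()) != null) {
                Path out = destDir.resolve(entry.getName());
                if (entry.isDirectory()) {
                    Files.createDirectories(out);
                } else {
                    Files.createDirectories(out.getParent());
                    // tarIn yields exactly the current entry's bytes, then signals EOF
                    Files.copy(tarIn, out);
                }
            }
        }
    }
}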
Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project neo4j by neo4j.
The class Loader, method openArchiveIn.
private static ArchiveInputStream openArchiveIn(Path archive) throws IOException, IncorrectFormat {
    InputStream input = Files.newInputStream(archive);
    GzipCompressorInputStream compressor;
    try {
        compressor = new GzipCompressorInputStream(input);
    } catch (IOException e) {
        input.close();
        throw new IncorrectFormat(archive, e);
    }
    return new TarArchiveInputStream(compressor);
}
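The GzipCompressorInputStream constructor reads and validates the gzip header immediately, which is why a non-gzip file fails fast inside the try block and can be reported as IncorrectFormat. A hypothetical caller might list the entries of such an archive as sketched below; the dump file name is an illustrative assumption, not Neo4j's actual Loader logic.

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class DumpLister {
    public static void main(String[] args) throws IOException {
        // the same stream chain the helper above builds: file -> gzip -> tar
        try (InputStream input = Files.newInputStream(Paths.get("dump.tar.gz"));
             TarArchiveInputStream tar = new TarArchiveInputStream(new GzipCompressorInputStream(input))) {
            ArchiveEntry entry;
            while ((entry = tar.getNextEntry()) != null) {
                System.out.println(entry.getName() + " (" + entry.getSize() + " bytes)");
            }
        }
    }
}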
Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project nifi by apache.
The class CompressContent, method onTrigger.
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final ComponentLog logger = getLogger();
    final long sizeBeforeCompression = flowFile.getSize();
    final String compressionMode = context.getProperty(MODE).getValue();
    String compressionFormatValue = context.getProperty(COMPRESSION_FORMAT).getValue();
    if (compressionFormatValue.equals(COMPRESSION_FORMAT_ATTRIBUTE)) {
        final String mimeType = flowFile.getAttribute(CoreAttributes.MIME_TYPE.key());
        if (mimeType == null) {
            logger.error("No {} attribute exists for {}; routing to failure", new Object[] { CoreAttributes.MIME_TYPE.key(), flowFile });
            session.transfer(flowFile, REL_FAILURE);
            return;
        }
        compressionFormatValue = compressionFormatMimeTypeMap.get(mimeType);
        if (compressionFormatValue == null) {
            logger.info("Mime Type of {} is '{}', which does not indicate a supported Compression Format; routing to success without decompressing", new Object[] { flowFile, mimeType });
            session.transfer(flowFile, REL_SUCCESS);
            return;
        }
    }
    final String compressionFormat = compressionFormatValue;
    final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null);
    final StopWatch stopWatch = new StopWatch(true);
    final String fileExtension;
    switch (compressionFormat.toLowerCase()) {
        case COMPRESSION_FORMAT_GZIP:
            fileExtension = ".gz";
            break;
        case COMPRESSION_FORMAT_LZMA:
            fileExtension = ".lzma";
            break;
        case COMPRESSION_FORMAT_XZ_LZMA2:
            fileExtension = ".xz";
            break;
        case COMPRESSION_FORMAT_BZIP2:
            fileExtension = ".bz2";
            break;
        case COMPRESSION_FORMAT_SNAPPY:
            fileExtension = ".snappy";
            break;
        case COMPRESSION_FORMAT_SNAPPY_FRAMED:
            fileExtension = ".sz";
            break;
        default:
            fileExtension = "";
            break;
    }
    try {
        flowFile = session.write(flowFile, new StreamCallback() {
            @Override
            public void process(final InputStream rawIn, final OutputStream rawOut) throws IOException {
                final OutputStream compressionOut;
                final InputStream compressionIn;
                final OutputStream bufferedOut = new BufferedOutputStream(rawOut, 65536);
                final InputStream bufferedIn = new BufferedInputStream(rawIn, 65536);
                try {
                    if (MODE_COMPRESS.equalsIgnoreCase(compressionMode)) {
                        compressionIn = bufferedIn;
                        switch (compressionFormat.toLowerCase()) {
                            case COMPRESSION_FORMAT_GZIP:
                                final int compressionLevel = context.getProperty(COMPRESSION_LEVEL).asInteger();
                                compressionOut = new GZIPOutputStream(bufferedOut, compressionLevel);
                                mimeTypeRef.set("application/gzip");
                                break;
                            case COMPRESSION_FORMAT_LZMA:
                                compressionOut = new LzmaOutputStream.Builder(bufferedOut).build();
                                mimeTypeRef.set("application/x-lzma");
                                break;
                            case COMPRESSION_FORMAT_XZ_LZMA2:
                                compressionOut = new XZOutputStream(bufferedOut, new LZMA2Options());
                                mimeTypeRef.set("application/x-xz");
                                break;
                            case COMPRESSION_FORMAT_SNAPPY:
                                compressionOut = new SnappyOutputStream(bufferedOut);
                                mimeTypeRef.set("application/x-snappy");
                                break;
                            case COMPRESSION_FORMAT_SNAPPY_FRAMED:
                                compressionOut = new SnappyFramedOutputStream(bufferedOut);
                                mimeTypeRef.set("application/x-snappy-framed");
                                break;
                            case COMPRESSION_FORMAT_BZIP2:
                            default:
                                mimeTypeRef.set("application/x-bzip2");
                                compressionOut = new CompressorStreamFactory().createCompressorOutputStream(compressionFormat.toLowerCase(), bufferedOut);
                                break;
                        }
                    } else {
                        compressionOut = bufferedOut;
                        switch (compressionFormat.toLowerCase()) {
                            case COMPRESSION_FORMAT_LZMA:
                                compressionIn = new LzmaInputStream(bufferedIn, new Decoder());
                                break;
                            case COMPRESSION_FORMAT_XZ_LZMA2:
                                compressionIn = new XZInputStream(bufferedIn);
                                break;
                            case COMPRESSION_FORMAT_BZIP2:
                                // need this two-arg constructor to support concatenated streams
                                compressionIn = new BZip2CompressorInputStream(bufferedIn, true);
                                break;
                            case COMPRESSION_FORMAT_GZIP:
                                // decompressConcatenated=true likewise handles multi-member gzip files
                                compressionIn = new GzipCompressorInputStream(bufferedIn, true);
                                break;
                            case COMPRESSION_FORMAT_SNAPPY:
                                compressionIn = new SnappyInputStream(bufferedIn);
                                break;
                            case COMPRESSION_FORMAT_SNAPPY_FRAMED:
                                compressionIn = new SnappyFramedInputStream(bufferedIn);
                                break;
                            default:
                                compressionIn = new CompressorStreamFactory().createCompressorInputStream(compressionFormat.toLowerCase(), bufferedIn);
                        }
                    }
                } catch (final Exception e) {
                    closeQuietly(bufferedOut);
                    throw new IOException(e);
                }
                try (final InputStream in = compressionIn;
                     final OutputStream out = compressionOut) {
                    final byte[] buffer = new byte[8192];
                    int len;
                    while ((len = in.read(buffer)) > 0) {
                        out.write(buffer, 0, len);
                    }
                    out.flush();
                }
            }
        });
        stopWatch.stop();
        final long sizeAfterCompression = flowFile.getSize();
        if (MODE_DECOMPRESS.equalsIgnoreCase(compressionMode)) {
            flowFile = session.removeAttribute(flowFile, CoreAttributes.MIME_TYPE.key());
            if (context.getProperty(UPDATE_FILENAME).asBoolean()) {
                final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
                if (filename.toLowerCase().endsWith(fileExtension)) {
                    flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), filename.substring(0, filename.length() - fileExtension.length()));
                }
            }
        } else {
            flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeTypeRef.get());
            if (context.getProperty(UPDATE_FILENAME).asBoolean()) {
                final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), filename + fileExtension);
            }
        }
        logger.info("Successfully {}ed {} using {} compression format; size changed from {} to {} bytes", new Object[] { compressionMode.toLowerCase(), flowFile, compressionFormat, sizeBeforeCompression, sizeAfterCompression });
        session.getProvenanceReporter().modifyContent(flowFile, stopWatch.getDuration(TimeUnit.MILLISECONDS));
        session.transfer(flowFile, REL_SUCCESS);
    } catch (final ProcessException e) {
        logger.error("Unable to {} {} using {} compression format due to {}; routing to failure", new Object[] { compressionMode.toLowerCase(), flowFile, compressionFormat, e });
        session.transfer(flowFile, REL_FAILURE);
    }
}
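Both default branches above delegate to Commons Compress's CompressorStreamFactory, looking the codec up by its lowercase name. The factory can also auto-detect the format from the stream's leading magic bytes when no name is given, as long as the input supports mark/reset. A minimal sketch of that variant follows; the file name payload.bin is an illustrative assumption.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.CompressorStreamFactory;

public class AutoDecompress {
    public static void main(String[] args) throws IOException, CompressorException {
        // BufferedInputStream supplies the mark/reset support format detection needs
        try (InputStream in = new BufferedInputStream(Files.newInputStream(Paths.get("payload.bin")));
             CompressorInputStream decompressed = new CompressorStreamFactory().createCompressorInputStream(in)) {
            System.out.println(decompressed.read()); // first decompressed byte
        }
    }
}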
Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project mmtf-spark by sbl-sdsc.
The class JpredDataset, method getDataset.
/**
 * Gets the JPred 4/JNet (v.2.3.1) secondary structure dataset.
 *
 * @return secondary structure dataset
 * @throws IOException if the file cannot be downloaded or read
 */
public static Dataset<Row> getDataset() throws IOException {
    List<Row> res = new ArrayList<Row>();
    URL u = new URL(URL);
    URLConnection conn = u.openConnection();
    InputStream in = conn.getInputStream();
    BufferedInputStream fin = new BufferedInputStream(in);
    GzipCompressorInputStream gzIn = new GzipCompressorInputStream(fin);
    TarArchiveInputStream tarIn = new TarArchiveInputStream(gzIn);
    TarArchiveEntry entry = null;
    Set<String> scopIDs = new HashSet<>();
    Map<String, String> sequences = new HashMap<String, String>();
    Map<String, String> secondaryStructures = new HashMap<String, String>();
    Map<String, String> trained = new HashMap<String, String>();
    while ((entry = (TarArchiveEntry) tarIn.getNextEntry()) != null) {
        if (entry.isDirectory()) {
            continue;
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(tarIn));
        if (entry.getName().contains(".dssp")) {
            String scopID = br.readLine().substring(1);
            String secondaryStructure = br.readLine();
            secondaryStructure = secondaryStructure.replace("-", "C");
            secondaryStructures.put(scopID, secondaryStructure);
        } else if (entry.getName().contains(".fasta")) {
            String scopID = br.readLine().substring(1);
            String sequence = br.readLine();
            scopIDs.add(scopID);
            sequences.put(scopID, sequence);
            if (entry.getName().contains("training/"))
                trained.put(scopID, "true");
            else if (entry.getName().contains("blind/"))
                trained.put(scopID, "false");
        }
    }
    tarIn.close();
    Iterator<String> iter = scopIDs.iterator();
    while (iter.hasNext()) {
        String scopID = iter.next();
        res.add(RowFactory.create(scopID, sequences.get(scopID), secondaryStructures.get(scopID), trained.get(scopID)));
    }
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource")
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Row> data = sc.parallelize(res);
    return JavaRDDToDataset.getDataset(data, "scopID", "sequence", "secondaryStructure", "trained");
}
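One subtlety above: a fresh BufferedReader is wrapped around tarIn for each entry but never closed, because closing the reader would also close the underlying tar stream before the remaining entries are read. TarArchiveInputStream reports end-of-entry as end-of-stream to whatever reads it, so each reader only ever sees the current entry. A minimal sketch of the same pattern in isolation is below; the archive name dataset.tar.gz is an illustrative assumption.

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class TarTextReader {
    public static void main(String[] args) throws IOException {
        try (TarArchiveInputStream tarIn = new TarArchiveInputStream(new GzipCompressorInputStream(
                new BufferedInputStream(Files.newInputStream(Paths.get("dataset.tar.gz")))))) {
            TarArchiveEntry entry;
            while ((entry = tarIn.getNextTarEntry()) != null) {
                if (entry.isDirectory()) {
                    continue;
                }
                // do NOT close this reader: that would close tarIn and end the loop early
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(tarIn, StandardCharsets.UTF_8));
                System.out.println(entry.getName() + ": " + br.readLine());
            }
        }
    }
}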