Use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project carbondata (by Apache) —
class FileFactory, method getDataInputStream:
/**
 * Opens a {@link DataInputStream} for the given path, transparently decompressing
 * sources whose path ends in {@code .gz} (GZip) or {@code .bz2} (BZip2).
 *
 * @param path       the file path (backslashes are normalized to forward slashes)
 * @param fileType   the backing file system kind (LOCAL, HDFS, ALLUXIO, VIEWFS)
 * @param bufferSize read buffer size for HDFS-like systems; -1 uses the FS default
 * @return a buffered DataInputStream over the (possibly decompressed) content
 * @throws IOException if the file cannot be opened or the codec stream fails
 * @throws UnsupportedOperationException for unrecognized file types
 */
public static DataInputStream getDataInputStream(String path, FileType fileType, int bufferSize) throws IOException {
path = path.replace("\\", "/");
final boolean isGzip = path.endsWith(".gz");
final boolean isBzip2 = path.endsWith(".bz2");
InputStream inputStream;
switch (fileType) {
case LOCAL:
// local files are decompressed with plain JDK / commons-compress streams
String localPath = getUpdatedFilePath(path, fileType);
FileInputStream fileStream = new FileInputStream(localPath);
if (isGzip) {
inputStream = new GZIPInputStream(fileStream);
} else if (isBzip2) {
inputStream = new BZip2CompressorInputStream(fileStream);
} else {
inputStream = fileStream;
}
break;
case HDFS:
case ALLUXIO:
case VIEWFS:
// HDFS-like systems use Hadoop's codec framework for decompression
Path hdfsPath = new Path(path);
FileSystem fileSystem = hdfsPath.getFileSystem(configuration);
inputStream = (bufferSize == -1) ? fileSystem.open(hdfsPath) : fileSystem.open(hdfsPath, bufferSize);
String codecClassName = isGzip ? GzipCodec.class.getName()
: isBzip2 ? BZip2Codec.class.getName()
: null;
if (codecClassName != null) {
CompressionCodecFactory codecFactory = new CompressionCodecFactory(configuration);
CompressionCodec codec = codecFactory.getCodecByClassName(codecClassName);
inputStream = codec.createInputStream(inputStream);
}
break;
default:
throw new UnsupportedOperationException("unsupported file system");
}
return new DataInputStream(new BufferedInputStream(inputStream));
}
Use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project beam (by Apache) —
class FileBasedSinkTest, method testCompressionTypeBZIP2:
/** Verifies that {@link CompressionType#BZIP2} correctly writes BZip2 data. */
@Test
public void testCompressionTypeBZIP2() throws FileNotFoundException, IOException {
final File file = writeValuesWithWritableByteChannelFactory(CompressionType.BZIP2, "abc", "123");
// Read the BZip2 data back using Apache Commons Compress (the de facto standard).
InputStream decompressed = new BZip2CompressorInputStream(new FileInputStream(file));
InputStreamReader charReader = new InputStreamReader(decompressed, StandardCharsets.UTF_8.name());
assertReadValues(new BufferedReader(charReader), "abc", "123");
}
Use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project stanbol (by Apache) —
class MultiThreadedTestBase, method initTestData:
/**
 * Helper method that initializes the test data based on the passed settings.
 * The test data is resolved (in order) as a file, a classpath resource, or a
 * URL, and is transparently unwrapped from GZip, BZip2 or ZIP containers
 * (only the first ZIP entry is processed).
 * @param settings the settings of the Test.
 * @return the Iterator over the contents in the test data
 * @throws IOException on any error while accessing the test data
 */
private Iterator<String> initTestData(TestSettings settings) throws IOException {
log.info("Read Testdata from '{}'", settings.getTestData());
File testFile = new File(settings.getTestData());
InputStream is = null;
// (1) try to resolve the test data as a local file
if (testFile.isFile()) {
log.info(" ... init from File");
is = new FileInputStream(testFile);
}
// (2) fall back to the classpath (context class loader, then system class loader)
if (is == null) {
is = MultiThreadedTest.class.getClassLoader().getResourceAsStream(settings.getTestData());
if (is != null) {
log.info(" ... init via Classpath");
}
}
if (is == null) {
is = ClassLoader.getSystemResourceAsStream(settings.getTestData());
if (is != null) {
log.info(" ... init via Classpath");
}
}
// (3) finally try to interpret the test data as a URL
if (is == null) {
try {
is = new URL(settings.getTestData()).openStream();
log.info(" ... init from URL");
} catch (MalformedURLException e) {
//not a URL - leave is == null so the assert below reports the failure
}
}
Assert.assertNotNull("Unable to load the parsed TestData '" + settings.getTestData() + "'!", is);
log.info(" - InputStream: {}", is.getClass().getSimpleName());
String name = FilenameUtils.getName(settings.getTestData());
if ("gz".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
is = new GZIPInputStream(is);
name = FilenameUtils.removeExtension(name);
log.debug(" - from GZIP Archive");
} else if ("bz2".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
is = new BZip2CompressorInputStream(is);
name = FilenameUtils.removeExtension(name);
log.debug(" - from BZip2 Archive");
} else if ("zip".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
ZipArchiveInputStream zipin = new ZipArchiveInputStream(is);
ArchiveEntry entry = zipin.getNextEntry();
log.info("For ZIP archives only the 1st Entry will be processed!");
name = FilenameUtils.getName(entry.getName());
log.info(" - processed Entry: {}", entry.getName());
// BUGFIX: read the decompressed entry data instead of the raw ZIP container
// bytes (the original never reassigned the wrapping stream)
is = zipin;
} else {
// else uncompressed data ...
log.info(" - uncompressed source: {}", name);
}
// determine the media type: explicit setting wins, otherwise map the file extension
String mediaType;
if (settings.getTestDataMediaType() != null) {
mediaType = settings.getTestDataMediaType();
} else {
//parse based on extension
String ext = FilenameUtils.getExtension(name);
if ("txt".equalsIgnoreCase(ext)) {
mediaType = TEXT_PLAIN;
} else if ("rdf".equalsIgnoreCase(ext)) {
mediaType = SupportedFormat.RDF_XML;
} else if ("xml".equalsIgnoreCase(ext)) {
mediaType = SupportedFormat.RDF_XML;
} else if ("ttl".equalsIgnoreCase(ext)) {
mediaType = SupportedFormat.TURTLE;
} else if ("n3".equalsIgnoreCase(ext)) {
mediaType = SupportedFormat.N3;
} else if ("nt".equalsIgnoreCase(ext)) {
mediaType = SupportedFormat.N_TRIPLE;
} else if ("json".equalsIgnoreCase(ext)) {
mediaType = SupportedFormat.RDF_JSON;
} else if (name.indexOf('.') < 0) {
//no extension
//try plain text
mediaType = TEXT_PLAIN;
} else {
log.info("Unknown File Extension {} for resource name {}", ext, name);
mediaType = null;
}
}
Assert.assertNotNull("Unable to detect MediaType for RDFTerm '" + name + "'. Please use the property '" + PROPERTY_TEST_DATA_TYPE + "' to manually parse the MediaType!", mediaType);
log.info(" - Media-Type: {}", mediaType);
//now init the iterator for the test data
return TEXT_PLAIN.equalsIgnoreCase(mediaType) ? createTextDataIterator(is, mediaType) : createRdfDataIterator(is, mediaType, settings.getContentProperty());
}
Use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project stanbol (by Apache) —
class Urify, method urify:
/**
 * Processes the given resource file through the reader/writer daemon pair,
 * writing the converted data to a sibling file prefixed with the configured
 * output prefix. GZip and BZip2 compressed sources are transparently
 * decompressed on read and re-compressed on write.
 * @param resource path to the source file
 * @throws IOException on any error reported by the reader or writer daemon
 * @throws FileNotFoundException if the resource does not denote an existing file
 */
private void urify(String resource) throws IOException {
File source = new File(resource);
if (source.isFile()) {
String path = FilenameUtils.getFullPathNoEndSeparator(resource);
String name = FilenameUtils.getName(resource);
// pick a target file name that does not collide with an existing file
File target = new File(path, outputFilePrefix + name);
int i = 0;
while (target.exists()) {
i++;
target = new File(path, "uf" + i + "_" + name);
}
InputStream is = new FileInputStream(source);
OutputStream os = new FileOutputStream(target);
log.info("RDFTerm: {}", resource);
log.info("Target : {}", target);
if ("gz".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
is = new GZIPInputStream(is);
os = new GZIPOutputStream(os);
name = FilenameUtils.removeExtension(name);
log.debug(" - from GZIP Archive");
} else if ("bz2".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
is = new BZip2CompressorInputStream(is);
os = new BZip2CompressorOutputStream(os);
name = FilenameUtils.removeExtension(name);
log.debug(" - from BZip2 Archive");
}
// TODO: no ZIP file support
// else uncompressed data
BlockingQueue<String> queue = new ArrayBlockingQueue<String>(1000);
ReaderDaemon reader = new ReaderDaemon(new BufferedReader(new InputStreamReader(is, charset)), queue);
WriterDaemon writer = new WriterDaemon(new BufferedWriter(new OutputStreamWriter(os, charset)), queue);
Thread readerDaemon = new Thread(reader, name + " reader");
Thread writerDaemon = new Thread(writer, name + " writer");
// daemon threads so a hung conversion cannot keep the JVM alive
readerDaemon.setDaemon(true);
writerDaemon.setDaemon(true);
writerDaemon.start();
readerDaemon.start();
Object notifier = writer.getNotifier();
synchronized (notifier) {
// BUGFIX: Object.wait() may return spuriously, so re-check the completion
// flag in a loop (was a single 'if' guard)
while (!writer.completed()) {
try {
notifier.wait();
} catch (InterruptedException e) {
// restore the interrupt status and stop waiting, matching the
// original behavior of returning early when interrupted
Thread.currentThread().interrupt();
break;
}
}
}
if (reader.getError() != null) {
throw new IOException("Error while reading source " + source, reader.getError());
}
if (writer.getError() != null) {
throw new IOException("Error while writing resource " + target, writer.getError());
}
log.info(" ... completed resource {}", resource);
} else {
throw new FileNotFoundException("Parsed File " + resource + " does not exist or is not a File!");
}
}
Use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project stanbol (by Apache) —
class IndexerImpl, method getEntityIdFileInputStream:
/**
 * Opens a stream to read data from the {@link #indexedEntityIdFile},
 * transparently unwrapping ZIP, GZip and BZip2 containers based on the
 * file extension.
 * Can only be called in {@link State}s later than {@link State#INDEXED}.
 * @return the stream, or {@code null} if no entity id file is configured
 * @throws IOException on any error while creating the stream
 * @throws IllegalStateException if {@link #getState()} is earlier than
 * {@link State#INDEXED}
 */
protected InputStream getEntityIdFileInputStream() throws IOException {
if (indexedEntityIdFile == null) {
return null;
}
State state = getState();
if (state.ordinal() < State.INDEXED.ordinal()) {
throw new IllegalStateException("The indexed entity id data is not" + "available for states < " + State.INDEXED + " (current: " + state + ")!");
}
//support compression
String extension = FilenameUtils.getExtension(indexedEntityIdFile.getName());
InputStream in = new FileInputStream(indexedEntityIdFile);
try {
if ("zip".equalsIgnoreCase(extension)) {
ZipInputStream zip = new ZipInputStream(in);
//only the first entry of the ZIP archive is read
zip.getNextEntry();
in = zip;
} else if ("gz".equalsIgnoreCase(extension)) {
in = new GZIPInputStream(in);
} else if ("bz2".equalsIgnoreCase(extension)) {
in = new BZip2CompressorInputStream(in);
}
} catch (IOException e) {
// BUGFIX: close the underlying FileInputStream if constructing the
// decompression wrapper fails, so the file handle is not leaked
in.close();
throw e;
}
return in;
}
Aggregations