Search in sources :

Example 16 with BZip2CompressorInputStream

use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project carbondata by apache.

the class FileFactory method getDataInputStream.

/**
 * Opens a {@link DataInputStream} on the given path, transparently decompressing
 * the content when the path ends with {@code .gz} (GZIP) or {@code .bz2} (BZip2).
 * <p>
 * Backslashes in {@code path} are replaced by forward slashes before the path is used.
 * For {@code LOCAL} files the path is additionally passed through
 * {@code getUpdatedFilePath} and the JDK / commons-compress decompressors are used;
 * for {@code HDFS}, {@code ALLUXIO} and {@code VIEWFS} the file is opened through the
 * Hadoop {@link FileSystem} and decompressed with the matching Hadoop codec.
 * <p>
 * If wrapping the raw stream fails (for example because the compressed header is
 * corrupt) the raw stream is closed before the failure is propagated, so no file
 * handle is leaked.
 *
 * @param path       location of the file to read
 * @param fileType   file system the path belongs to
 * @param bufferSize buffer size handed to {@code FileSystem.open}; {@code -1} opens the
 *                   file without an explicit size. Not used for {@code LOCAL} files.
 * @return a buffered data stream over the (decompressed) file content
 * @throws IOException                   if the file cannot be opened or decompressed
 * @throws UnsupportedOperationException if {@code fileType} is not one of the handled types
 */
public static DataInputStream getDataInputStream(String path, FileType fileType, int bufferSize) throws IOException {
    path = path.replace("\\", "/");
    boolean gzip = path.endsWith(".gz");
    boolean bzip2 = path.endsWith(".bz2");
    InputStream stream = null;
    // Stays false until every wrapper has been created; drives the cleanup below.
    boolean opened = false;
    try {
        switch(fileType) {
            case LOCAL:
                path = getUpdatedFilePath(path, fileType);
                stream = new FileInputStream(path);
                // Both decompressors read the header in their constructor and may throw;
                // 'stream' then still references the raw file stream for the cleanup.
                if (gzip) {
                    stream = new GZIPInputStream(stream);
                } else if (bzip2) {
                    stream = new BZip2CompressorInputStream(stream);
                }
                break;
            case HDFS:
            case ALLUXIO:
            case VIEWFS:
                Path pt = new Path(path);
                FileSystem fs = pt.getFileSystem(configuration);
                if (bufferSize == -1) {
                    stream = fs.open(pt);
                } else {
                    stream = fs.open(pt, bufferSize);
                }
                String codecName = null;
                if (gzip) {
                    codecName = GzipCodec.class.getName();
                } else if (bzip2) {
                    codecName = BZip2Codec.class.getName();
                }
                if (null != codecName) {
                    CompressionCodecFactory ccf = new CompressionCodecFactory(configuration);
                    CompressionCodec codec = ccf.getCodecByClassName(codecName);
                    stream = codec.createInputStream(stream);
                }
                break;
            default:
                throw new UnsupportedOperationException("unsupported file system");
        }
        opened = true;
    } finally {
        if (!opened && stream != null) {
            try {
                stream.close();
            } catch (IOException ignored) {
                // the failure that brought us here is the one worth reporting
            }
        }
    }
    return new DataInputStream(new BufferedInputStream(stream));
}
Also used : Path(org.apache.hadoop.fs.Path) DataInputStream(java.io.DataInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) InputStream(java.io.InputStream) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) DataInputStream(java.io.DataInputStream) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FileInputStream(java.io.FileInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) BufferedInputStream(java.io.BufferedInputStream) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)

Example 17 with BZip2CompressorInputStream

use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project beam by apache.

the class FileBasedSinkTest method testCompressionTypeBZIP2.

/** Checks that values written with {@link CompressionType#BZIP2} come back intact when decoded as BZip2. */
@Test
public void testCompressionTypeBZIP2() throws FileNotFoundException, IOException {
    final File file = writeValuesWithWritableByteChannelFactory(CompressionType.BZIP2, "abc", "123");
    // Decode with the Apache commons-compress implementation (de facto standard)
    // rather than with the code under test.
    final BZip2CompressorInputStream decompressed =
        new BZip2CompressorInputStream(new FileInputStream(file));
    final BufferedReader lines =
        new BufferedReader(new InputStreamReader(decompressed, StandardCharsets.UTF_8.name()));
    assertReadValues(lines, "abc", "123");
}
Also used : InputStreamReader(java.io.InputStreamReader) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) BufferedReader(java.io.BufferedReader) File(java.io.File) FileInputStream(java.io.FileInputStream) Test(org.junit.Test)

Example 18 with BZip2CompressorInputStream

use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project stanbol by apache.

the class MultiThreadedTestBase method initTestData.

/**
 * Helper method that initialises the test data based on the passed settings.
 * <p>
 * The value of {@code settings.getTestData()} is resolved, in this order, as a
 * file, as a resource of the {@code MultiThreadedTest} class loader, as a system
 * class loader resource and finally as a URL. Sources whose name ends with
 * {@code gz}, {@code bz2} or {@code zip} are decompressed; for ZIP archives only
 * the first entry is read. Unless the settings provide a media type it is
 * derived from the (uncompressed) file extension.
 *
 * @param settings the settings of the Test.
 * @return the Iterator over the contents in the test data
 * @throws IOException on any error while accessing the test data
 */
private Iterator<String> initTestData(TestSettings settings) throws IOException {
    log.info("Read Testdata from '{}'", settings.getTestData());
    File testFile = new File(settings.getTestData());
    InputStream is = null;
    if (testFile.isFile()) {
        log.info(" ... init from File");
        is = new FileInputStream(testFile);
    }
    if (is == null) {
        is = MultiThreadedTest.class.getClassLoader().getResourceAsStream(settings.getTestData());
        if (is == null) {
            is = ClassLoader.getSystemResourceAsStream(settings.getTestData());
        }
        if (is != null) {
            // only report the classpath when the data really came from there
            log.info(" ... init via Classpath");
        }
    }
    if (is == null) {
        try {
            is = new URL(settings.getTestData()).openStream();
            log.info(" ... init from URL");
        } catch (MalformedURLException e) {
            // not a URL: leave 'is' null so the assertion below reports the problem
        }
    }
    Assert.assertNotNull("Unable to load the parsed TestData '" + settings.getTestData() + "'!", is);
    log.info("  - InputStream: {}", is.getClass().getSimpleName());
    String name = FilenameUtils.getName(settings.getTestData());
    if ("gz".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
        is = new GZIPInputStream(is);
        name = FilenameUtils.removeExtension(name);
        log.debug("   - from GZIP Archive");
    } else if ("bz2".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
        is = new BZip2CompressorInputStream(is);
        name = FilenameUtils.removeExtension(name);
        log.debug("   - from BZip2 Archive");
    } else if ("zip".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
        ZipArchiveInputStream zipin = new ZipArchiveInputStream(is);
        ArchiveEntry entry = zipin.getNextEntry();
        Assert.assertNotNull("The ZIP archive '" + settings.getTestData() + "' does not contain any entry!", entry);
        log.info("For ZIP archives only the 1st Entry will be processed!");
        name = FilenameUtils.getName(entry.getName());
        log.info("  - processed Entry: {}", entry.getName());
        // Read the entry content through the archive stream; the raw stream has
        // already been partly consumed and would only yield compressed bytes.
        is = zipin;
    } else {
        // else uncompressed data ...
        log.info("  - uncompressed source: {}", name);
    }
    String mediaType;
    if (settings.getTestDataMediaType() != null) {
        mediaType = settings.getTestDataMediaType();
    } else {
        //parse based on extension
        String ext = FilenameUtils.getExtension(name);
        if ("txt".equalsIgnoreCase(ext)) {
            mediaType = TEXT_PLAIN;
        } else if ("rdf".equalsIgnoreCase(ext)) {
            mediaType = SupportedFormat.RDF_XML;
        } else if ("xml".equalsIgnoreCase(ext)) {
            mediaType = SupportedFormat.RDF_XML;
        } else if ("ttl".equalsIgnoreCase(ext)) {
            mediaType = SupportedFormat.TURTLE;
        } else if ("n3".equalsIgnoreCase(ext)) {
            mediaType = SupportedFormat.N3;
        } else if ("nt".equalsIgnoreCase(ext)) {
            mediaType = SupportedFormat.N_TRIPLE;
        } else if ("json".equalsIgnoreCase(ext)) {
            mediaType = SupportedFormat.RDF_JSON;
        } else if (name.indexOf('.') < 0) {
            //no extension
            //try plain text
            mediaType = TEXT_PLAIN;
        } else {
            log.info("Unkown File Extension {} for resource name {}", ext, name);
            mediaType = null;
        }
    }
    Assert.assertNotNull("Unable to detect MediaType for RDFTerm '" + name + "'. Please use the property '" + PROPERTY_TEST_DATA_TYPE + "' to manually parse the MediaType!", mediaType);
    log.info("  - Media-Type: {}", mediaType);
    //now init the iterator for the test data
    return TEXT_PLAIN.equalsIgnoreCase(mediaType) ? createTextDataIterator(is, mediaType) : createRdfDataIterator(is, mediaType, settings.getContentProperty());
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) MalformedURLException(java.net.MalformedURLException) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) FileInputStream(java.io.FileInputStream) ZipArchiveInputStream(org.apache.commons.compress.archivers.zip.ZipArchiveInputStream) InputStream(java.io.InputStream) ArchiveEntry(org.apache.commons.compress.archivers.ArchiveEntry) File(java.io.File) FileInputStream(java.io.FileInputStream) URL(java.net.URL)

Example 19 with BZip2CompressorInputStream

use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project stanbol by apache.

the class Urify method urify.

/**
 * Processes a single file: its content is read by a {@code ReaderDaemon},
 * handed over through a bounded queue and written by a {@code WriterDaemon}
 * to a new file in the same directory. The target is named
 * {@code outputFilePrefix + name}; if that file already exists the first free
 * {@code "uf" + i + "_" + name} is used instead. GZIP ({@code gz}) and BZip2
 * ({@code bz2}) sources are decompressed while reading and the target is
 * compressed the same way.
 * <p>
 * The call blocks until the writer reports completion or an error, or until
 * the calling thread is interrupted.
 *
 * @param resource path of the file to process
 * @throws FileNotFoundException if {@code resource} does not denote an existing file
 * @throws IOException if the reader or the writer reported an error
 */
private void urify(String resource) throws IOException {
    File source = new File(resource);
    if (source.isFile()) {
        String path = FilenameUtils.getFullPathNoEndSeparator(resource);
        String name = FilenameUtils.getName(resource);
        File target = new File(path, outputFilePrefix + name);
        int i = 0;
        // NOTE(review): fallback names use a hard-coded "uf" prefix instead of
        // outputFilePrefix - confirm this is intended.
        while (target.exists()) {
            i++;
            target = new File(path, "uf" + i + "_" + name);
        }
        InputStream is = new FileInputStream(source);
        OutputStream os = new FileOutputStream(target);
        log.info("RDFTerm: {}", resource);
        log.info("Target  : {}", target);
        if ("gz".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
            is = new GZIPInputStream(is);
            os = new GZIPOutputStream(os);
            name = FilenameUtils.removeExtension(name);
            log.debug("   - from GZIP Archive");
        } else if ("bz2".equalsIgnoreCase(FilenameUtils.getExtension(name))) {
            is = new BZip2CompressorInputStream(is);
            os = new BZip2CompressorOutputStream(os);
            name = FilenameUtils.removeExtension(name);
            log.debug("   - from BZip2 Archive");
        }
        // TODO: No Zip File support
        //else no compression
        BlockingQueue<String> queue = new ArrayBlockingQueue<String>(1000);
        ReaderDaemon reader = new ReaderDaemon(new BufferedReader(new InputStreamReader(is, charset)), queue);
        WriterDaemon writer = new WriterDaemon(new BufferedWriter(new OutputStreamWriter(os, charset)), queue);
        Thread readerDaemon = new Thread(reader, name + " reader");
        Thread writerDaemon = new Thread(writer, name + " writer");
        readerDaemon.setDaemon(true);
        writerDaemon.setDaemon(true);
        writerDaemon.start();
        readerDaemon.start();
        Object notifier = writer.getNotifier();
        synchronized (notifier) {
            // Object.wait() may return without a notification (spurious wakeup),
            // so the condition is re-checked in a loop. A reported writer error
            // also ends the wait so that a failed writer can never block us.
            while (!writer.completed() && writer.getError() == null) {
                try {
                    notifier.wait();
                } catch (InterruptedException e) {
                    // stop waiting, but keep the interrupt visible to the caller
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }
        if (reader.getError() != null) {
            throw new IOException("Error while reading source " + source, reader.getError());
        }
        if (writer.getError() != null) {
            throw new IOException("Error while writing resource " + target, writer.getError());
        }
        log.info(" ... completed resource {}", resource);
    } else {
        throw new FileNotFoundException("Parsed File " + resource + " does not exist or is not a File!");
    }
}
Also used : BZip2CompressorOutputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream) InputStreamReader(java.io.InputStreamReader) GZIPInputStream(java.util.zip.GZIPInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) BZip2CompressorOutputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) GZIPOutputStream(java.util.zip.GZIPOutputStream) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BufferedWriter(java.io.BufferedWriter) GZIPInputStream(java.util.zip.GZIPInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) GZIPOutputStream(java.util.zip.GZIPOutputStream) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File)

Example 20 with BZip2CompressorInputStream

use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project stanbol by apache.

the class IndexerImpl method getEntityIdFileInputStream.

/**
 * Opens a stream to read data from the {@link #indexedEntityIdFile}.
 * Can only be called in {@link State#INDEXED} or a later {@link State}.
 * <p>
 * Files with the extension {@code zip}, {@code gz} or {@code bz2} are
 * decompressed transparently; for ZIP files the stream is positioned on the
 * first entry. If setting up the decompression fails, the underlying file
 * stream is closed before the exception is propagated.
 *
 * @return the stream, or {@code null} if no {@link #indexedEntityIdFile} is set
 * @throws IOException on any error while creating the stream
 * @throws IllegalStateException if {@link #getState()} is earlier than
 * {@link State#INDEXED}
 */
protected InputStream getEntityIdFileInputStream() throws IOException {
    if (indexedEntityIdFile == null) {
        return null;
    }
    State state = getState();
    if (state.ordinal() < State.INDEXED.ordinal()) {
        throw new IllegalStateException("The indexed entity id data is not " + "available for states < " + State.INDEXED + " (current: " + state + ")!");
    }
    //support compression
    String extension = FilenameUtils.getExtension(indexedEntityIdFile.getName());
    InputStream in = new FileInputStream(indexedEntityIdFile);
    // Stays false until the (possibly wrapped) stream is ready to be returned.
    boolean ready = false;
    try {
        if ("zip".equalsIgnoreCase(extension)) {
            ZipInputStream zip = new ZipInputStream(in);
            in = zip;
            zip.getNextEntry();
        } else if ("gz".equalsIgnoreCase(extension)) {
            in = new GZIPInputStream(in);
        } else if ("bz2".equalsIgnoreCase(extension)) {
            in = new BZip2CompressorInputStream(in);
        }
        ready = true;
    } finally {
        if (!ready) {
            // a decompressor rejected the file: release the file handle
            try {
                in.close();
            } catch (IOException ignored) {
                // the failure that brought us here is the one worth reporting
            }
        }
    }
    return in;
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) ZipInputStream(java.util.zip.ZipInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ZipInputStream(java.util.zip.ZipInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileInputStream(java.io.FileInputStream)

Aggregations

BZip2CompressorInputStream (org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream)21 FileInputStream (java.io.FileInputStream)12 IOException (java.io.IOException)7 InputStream (java.io.InputStream)7 File (java.io.File)6 GZIPInputStream (java.util.zip.GZIPInputStream)6 FileOutputStream (java.io.FileOutputStream)5 BufferedReader (java.io.BufferedReader)4 InputStreamReader (java.io.InputStreamReader)4 GzipCompressorInputStream (org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream)4 BufferedInputStream (java.io.BufferedInputStream)3 TarArchiveInputStream (org.apache.commons.compress.archivers.tar.TarArchiveInputStream)3 FileNotFoundException (java.io.FileNotFoundException)2 FileWriter (java.io.FileWriter)2 OutputStream (java.io.OutputStream)2 URL (java.net.URL)2 ByteBuffer (java.nio.ByteBuffer)2 ArchiveInputStream (org.apache.commons.compress.archivers.ArchiveInputStream)2 ZipArchiveInputStream (org.apache.commons.compress.archivers.zip.ZipArchiveInputStream)2 CompressorInputStream (org.apache.commons.compress.compressors.CompressorInputStream)2