Example 1 with BlockCompressedOutputStream

Use of htsjdk.samtools.util.BlockCompressedOutputStream in project gatk by broadinstitute.

The class GatherVcfs, method gatherWithBlockCopying.

/**
 * Assumes that all inputs and outputs are block-compressed VCF files and copies them without decompressing and parsing
 * most of the gzip blocks. Will decompress and parse blocks up to the one containing the end of the header in each file
 * (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent
 * blocks (excluding a terminator block, if present) are copied directly from input to output.
 */
private static void gatherWithBlockCopying(final List<File> vcfs, final File output) {
    try (final FileOutputStream out = new FileOutputStream(output)) {
        boolean isFirstFile = true;
        for (final File f : vcfs) {
            log.info("Gathering " + f.getAbsolutePath());
            try (final FileInputStream in = new FileInputStream(f)) {
                // a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it
                final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(f);
                if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE)
                    throw new UserException.MalformedFile(f.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
                if (!isFirstFile) {
                    final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in, false);
                    boolean lastByteNewline = true;
                    while (in.available() > 0) {
                        // Read a block - blockIn.available() is guaranteed to return the bytes remaining in the block that has been
                        // read, and since we haven't consumed any yet, that is the block size.
                        final int blockLength = blockIn.available();
                        final byte[] blockContents = new byte[blockLength];
                        final int read = blockIn.read(blockContents);
                        Utils.validate(blockLength > 0 && read == blockLength, "Could not read available bytes from BlockCompressedInputStream.");
                        // Scan forward within the block to see if we can find the end of the header within this block
                        int firstNonHeaderByteIndex = -1;
                        for (int i = 0; i < read; ++i) {
                            final byte b = blockContents[i];
                            final boolean thisByteNewline = (b == '\n' || b == '\r');
                            if (lastByteNewline && !thisByteNewline && b != '#') {
                                // Aha!  Found first byte of non-header data in file!
                                firstNonHeaderByteIndex = i;
                                break;
                            }
                            lastByteNewline = thisByteNewline;
                        }
                        // If we found the end of the header, write the remainder of this block out as a
                        // new gzip block and then break out of the while loop
                        if (firstNonHeaderByteIndex >= 0) {
                            final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(out, null);
                            blockOut.write(blockContents, firstNonHeaderByteIndex, blockContents.length - firstNonHeaderByteIndex);
                            blockOut.flush();
                            // Don't close blockOut because closing underlying stream would break everything
                            break;
                        }
                    }
                }
                // Copy remainder of input stream into output stream
                final long currentPos = in.getChannel().position();
                final long length = f.length();
                final long skipLast = (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) ? BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
                final long bytesToWrite = length - skipLast - currentPos;
                IOUtil.transferByStream(in, out, bytesToWrite);
                isFirstFile = false;
            }
        }
        // And lastly add the Terminator block and close up
        out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    } catch (final IOException ioe) {
        throw new RuntimeIOException(ioe);
    }
}
Also used : RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) BlockCompressedOutputStream(htsjdk.samtools.util.BlockCompressedOutputStream) UserException(org.broadinstitute.hellbender.exceptions.UserException) BlockCompressedInputStream(htsjdk.samtools.util.BlockCompressedInputStream)
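
For context, a minimal sketch of the termination check this method depends on (the file name is an assumption). BlockCompressedInputStream.checkTermination inspects only the tail of a BGZF file, which is how gatherWithBlockCopying can reject defective inputs up front and decide whether a terminator block must be skipped during the raw copy.

import java.io.File;
import java.io.IOException;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;

public class BgzfTerminationCheck {
    public static void main(final String[] args) throws IOException {
        // Hypothetical input: any block-compressed (BGZF) VCF
        final File vcf = new File("calls.vcf.gz");
        // Inspects only the end of the file; returns DEFECTIVE, HAS_TERMINATOR_BLOCK or HAS_HEALTHY_LAST_BLOCK
        final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(vcf);
        System.out.println(vcf + ": " + term);
        // The empty BGZF block that gatherWithBlockCopying strips from inputs and appends once at the end
        System.out.println("terminator length: " + BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length + " bytes");
    }
}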

Example 2 with BlockCompressedOutputStream

Use of htsjdk.samtools.util.BlockCompressedOutputStream in project gatk by broadinstitute.

The class SparkUtils, method writeBAMHeaderToStream.

/**
 * Private helper method for {@link #convertHeaderlessHadoopBamShardToBam} that takes a SAMFileHeader and writes it
 * to the provided `OutputStream`, correctly encoded for the BAM format and preceded by the BAM magic bytes.
 *
 * @param samFileHeader SAM header to write
 * @param outputStream stream to write the SAM header to
 */
private static void writeBAMHeaderToStream(final SAMFileHeader samFileHeader, final OutputStream outputStream) {
    final BlockCompressedOutputStream blockCompressedOutputStream = new BlockCompressedOutputStream(outputStream, null);
    final BinaryCodec outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream));
    final String headerString;
    final Writer stringWriter = new StringWriter();
    new SAMTextHeaderCodec().encode(stringWriter, samFileHeader, true);
    headerString = stringWriter.toString();
    outputBinaryCodec.writeBytes(ReadUtils.BAM_MAGIC);
    // calculate and write the length of the SAM file header text and the header text
    outputBinaryCodec.writeString(headerString, true, false);
    // write the sequence dictionary in binary form; this is redundant with the text header
    outputBinaryCodec.writeInt(samFileHeader.getSequenceDictionary().size());
    for (final SAMSequenceRecord sequenceRecord : samFileHeader.getSequenceDictionary().getSequences()) {
        outputBinaryCodec.writeString(sequenceRecord.getSequenceName(), true, true);
        outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength());
    }
    try {
        blockCompressedOutputStream.flush();
    } catch (final IOException ioe) {
        throw new RuntimeIOException(ioe);
    }
}
Also used : BinaryCodec(htsjdk.samtools.util.BinaryCodec) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) SAMTextHeaderCodec(htsjdk.samtools.SAMTextHeaderCodec) BlockCompressedOutputStream(htsjdk.samtools.util.BlockCompressedOutputStream) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord)
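
To see just the text-encoding step in isolation, here is a self-contained sketch: it builds a SAMFileHeader in memory (the contig name and length are made up) and runs the same SAMTextHeaderCodec.encode call the method above uses before writing the binary sequence records.

import java.io.StringWriter;
import java.util.Collections;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SAMTextHeaderCodec;

public class HeaderTextSketch {
    public static void main(final String[] args) {
        final SAMFileHeader header = new SAMFileHeader();
        header.setSequenceDictionary(new SAMSequenceDictionary(
                Collections.singletonList(new SAMSequenceRecord("chr1", 248956422))));
        final StringWriter sw = new StringWriter();
        // Same call as above; the boolean keeps any existing @HD version number
        new SAMTextHeaderCodec().encode(sw, header, true);
        System.out.print(sw);  // prints the @HD and @SQ text lines
    }
}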

Example 3 with BlockCompressedOutputStream

Use of htsjdk.samtools.util.BlockCompressedOutputStream in project ASCIIGenome by dariober.

The class UcscFetch, method blockCompressAndIndex.

/**
 * Block-compress the input file and create the associated tabix index. The newly created file and index are
 * deleted on exit if deleteOnExit is true.
 * @throws IOException
 * @throws InvalidRecordException
 */
private void blockCompressAndIndex(String in, String bgzfOut, boolean deleteOnExit) throws IOException, InvalidRecordException {
    File inFile = new File(in);
    File outFile = new File(bgzfOut);
    LineIterator lin = utils.IOUtils.openURIForLineIterator(inFile.getAbsolutePath());
    BlockCompressedOutputStream writer = new BlockCompressedOutputStream(outFile);
    long filePosition = writer.getFilePointer();
    TabixIndexCreator indexCreator = new TabixIndexCreator(TabixFormat.GFF);
    while (lin.hasNext()) {
        String line = lin.next();
        GtfLine gtf = new GtfLine(line.split("\t"));
        writer.write(line.getBytes());
        writer.write('\n');
        indexCreator.addFeature(gtf, filePosition);
        filePosition = writer.getFilePointer();
    }
    writer.flush();
    File tbi = new File(bgzfOut + TabixUtils.STANDARD_INDEX_EXTENSION);
    if (tbi.exists() && tbi.isFile()) {
        writer.close();
        throw new RuntimeException("Index file exists: " + tbi);
    }
    Index index = indexCreator.finalizeIndex(writer.getFilePointer());
    index.writeBasedOnFeatureFile(outFile);
    writer.close();
    if (deleteOnExit) {
        outFile.deleteOnExit();
        File idx = new File(outFile.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION);
        idx.deleteOnExit();
    }
}
Also used : GtfLine(utils.GtfLine) BlockCompressedOutputStream(htsjdk.samtools.util.BlockCompressedOutputStream) TabixIndexCreator(htsjdk.tribble.index.tabix.TabixIndexCreator) Index(htsjdk.tribble.index.Index) File(java.io.File) LineIterator(htsjdk.tribble.readers.LineIterator)
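
Once blockCompressAndIndex has produced the .gz and .tbi pair, the result can be queried by region with htsjdk's TabixReader. A minimal usage sketch; the file names and the region string are assumptions.

import java.io.IOException;
import htsjdk.tribble.readers.TabixReader;

public class TabixQuerySketch {
    public static void main(final String[] args) throws IOException {
        // Hypothetical outputs of blockCompressAndIndex
        final TabixReader reader = new TabixReader("annotation.gtf.gz", "annotation.gtf.gz.tbi");
        final TabixReader.Iterator it = reader.query("chr1:1000000-2000000");
        String line;
        while ((line = it.next()) != null) {  // next() returns null once the region is exhausted
            System.out.println(line);
        }
        reader.close();
    }
}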

Example 4 with BlockCompressedOutputStream

Use of htsjdk.samtools.util.BlockCompressedOutputStream in project ASCIIGenome by dariober.

The class MakeTabixIndex, method blockCompressAndIndex.

/**
 * Block-compress the input file and create the associated tabix index.
 * @throws IOException
 * @throws InvalidRecordException
 */
private void blockCompressAndIndex(String intab, File bgzfOut, TabixFormat fmt) throws IOException, InvalidRecordException {
    LineIterator lin = utils.IOUtils.openURIForLineIterator(intab);
    BlockCompressedOutputStream writer = new BlockCompressedOutputStream(bgzfOut);
    long filePosition = writer.getFilePointer();
    TabixIndexCreator indexCreator = new TabixIndexCreator(fmt);
    boolean first = true;
    // This is relevant to vcf files only: Prepare header and codec
    // ------------------------------------------------------------
    VCFHeader vcfHeader = null;
    VCFCodec vcfCodec = null;
    if (fmt.equals(TabixFormat.VCF)) {
        try {
            VCFFileReader vcfr = new VCFFileReader(new File(intab), false);
            // new VCFHeader();
            vcfHeader = vcfr.getFileHeader();
            vcfr.close();
        } catch (MalformedFeatureFile e) {
            vcfHeader = new VCFHeader();
        }
        vcfCodec = new VCFCodec();
        vcfCodec.setVCFHeader(vcfHeader, Utils.getVCFHeaderVersion(vcfHeader));
    }
    // ------------------------------------------------------------
    int nWarnings = 10;
    while (lin.hasNext()) {
        String line = lin.next().trim();
        try {
            if (line.isEmpty() || line.startsWith("track ")) {
                continue;
            }
            if (line.startsWith("#")) {
                writer.write((line + "\n").getBytes());
                filePosition = writer.getFilePointer();
                continue;
            }
            if (line.startsWith("##FASTA")) {
                break;
            }
            if (first && !fmt.equals(TabixFormat.VCF)) {
                String dummy = this.makeDummyLine(line, fmt);
                addLineToIndex(dummy, indexCreator, filePosition, fmt, null, null);
                writer.write(dummy.getBytes());
                writer.write('\n');
                filePosition = writer.getFilePointer();
                first = false;
            }
            addLineToIndex(line, indexCreator, filePosition, fmt, vcfHeader, vcfCodec);
            writer.write(line.getBytes());
            writer.write('\n');
            filePosition = writer.getFilePointer();
        } catch (Exception e) {
            if (e.getMessage().contains("added out sequence of order") || e.getMessage().contains("Features added out of order")) {
                // Get a string marker for out-of-order from htsjdk/tribble/index/tabix/TabixIndexCreator.java
                throw new InvalidRecordException();
            }
            if (nWarnings >= 0) {
                System.err.println("Warning: " + e.getMessage() + ". Skipping:\n" + line);
            }
            if (nWarnings == 0) {
                System.err.println("Additional warnings will not be show.");
            }
            nWarnings--;
        }
    }
    writer.flush();
    Index index = indexCreator.finalizeIndex(writer.getFilePointer());
    index.writeBasedOnFeatureFile(bgzfOut);
    writer.close();
    CloserUtil.close(lin);
}
Also used : VCFCodec(htsjdk.variant.vcf.VCFCodec) BlockCompressedOutputStream(htsjdk.samtools.util.BlockCompressedOutputStream) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) TabixIndexCreator(htsjdk.tribble.index.tabix.TabixIndexCreator) Index(htsjdk.tribble.index.Index) MalformedFeatureFile(htsjdk.tribble.TribbleException.MalformedFeatureFile) LineIterator(htsjdk.tribble.readers.LineIterator) InvalidRecordException(exceptions.InvalidRecordException) SQLException(java.sql.SQLException) IOException(java.io.IOException) VCFHeader(htsjdk.variant.vcf.VCFHeader) File(java.io.File)
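
The VCF branch above has to install a header into the codec before raw lines can be decoded for indexing. A small sketch of that setup on its own; the input path, the hard-coded header version (standing in for the snippet's Utils.getVCFHeaderVersion), and the example record are all assumptions.

import java.io.File;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;

public class VcfCodecSketch {
    public static void main(final String[] args) {
        // requireIndex = false: only the header is needed
        final VCFFileReader vcfr = new VCFFileReader(new File("input.vcf"), false);
        final VCFHeader header = vcfr.getFileHeader();
        vcfr.close();
        final VCFCodec codec = new VCFCodec();
        codec.setVCFHeader(header, VCFHeaderVersion.VCF4_2);  // version is an assumption
        // With the header installed, raw data lines can be decoded (assumes a sites-only VCF)
        final VariantContext ctx = codec.decode("chr1\t100\trs1\tA\tG\t50\tPASS\t.");
        System.out.println(ctx.getContig() + ":" + ctx.getStart());
    }
}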

Example 5 with BlockCompressedOutputStream

Use of htsjdk.samtools.util.BlockCompressedOutputStream in project jvarkit by lindenb.

The class BedIndexTabix, method run.

protected void run(LineIterator in) throws IOException {
    int bedLineCount = 0;
    File tbi = new File(outputFile.getPath() + TabixUtils.STANDARD_INDEX_EXTENSION);
    BlockCompressedOutputStream writer = null;
    SortingCollection<String> sorter = null;
    final Comparator<String> comparator = new Comparator<String>() {

        @Override
        public int compare(String o1, String o2) {
            BedLine bed1 = bedCodec.decode(o1);
            BedLine bed2 = bedCodec.decode(o2);
            int i = bed1.getContig().compareTo(bed2.getContig());
            if (i != 0)
                return i;
            i = bed1.getStart() - bed2.getStart();
            if (i != 0)
                return i;
            i = bed1.getEnd() - bed2.getEnd();
            if (i != 0)
                return i;
            return o1.compareTo(o2);
        }
    };
    CloseableIterator<String> iter = null;
    try {
        TabixIndexCreator indexCreator = new TabixIndexCreator(TabixFormat.BED);
        LOG.info("Opening" + outputFile);
        writer = new BlockCompressedOutputStream(this.outputFile);
        StringBuilder header = new StringBuilder();
        while (in.hasNext()) {
            String h = in.peek();
            if (!BedLine.isBedHeader(h))
                break;
            header.append(in.next()).append('\n');
        }
        // write header
        if (header.length() > 0) {
            LOG.info("Writing header");
            writer.write(header.toString().getBytes());
        }
        if (this.sort) {
            LOG.info("Sorting");
            sorter = SortingCollection.newInstance(String.class, new BedDataCodec(), comparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            while (in.hasNext()) {
                String line = in.next();
                BedLine bed = bedCodec.decode(line);
                if (bed == null)
                    continue;
                sorter.add(line);
            }
            sorter.doneAdding();
            sorter.setDestructiveIteration(true);
            iter = sorter.iterator();
            long filePosition = writer.getFilePointer();
            while (iter.hasNext()) {
                String line = iter.next();
                BedLine bed = this.bedCodec.decode(line);
                writer.write(line.getBytes());
                writer.write('\n');
                indexCreator.addFeature(bed, filePosition);
                filePosition = writer.getFilePointer();
                bedLineCount++;
            }
            sorter.cleanup();
        } else {
            long filePosition = writer.getFilePointer();
            while (in.hasNext()) {
                String line = in.next();
                BedLine bed = this.bedCodec.decode(line);
                if (bed == null)
                    continue;
                writer.write(line.getBytes());
                writer.write('\n');
                indexCreator.addFeature(bed, filePosition);
                filePosition = writer.getFilePointer();
                bedLineCount++;
            }
        }
        writer.flush();
        LOG.info("Creating index");
        Index index = indexCreator.finalizeIndex(writer.getFilePointer());
        LOG.info("Writing index to " + tbi + " using " + index.getClass());
        index.writeBasedOnFeatureFile(this.outputFile);
        writer.close();
        writer = null;
        LOG.info("Done  N=" + bedLineCount);
    } catch (Exception e) {
        if (this.outputFile.exists() && this.outputFile.isFile()) {
            LOG.warning("Deleting " + this.outputFile);
            this.outputFile.delete();
            if (tbi.exists() && tbi.isFile())
                tbi.delete();
        }
        throw new IOException(e);
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sorter);
        CloserUtil.close(writer);
    }
}
Also used : BlockCompressedOutputStream(htsjdk.samtools.util.BlockCompressedOutputStream) TabixIndexCreator(htsjdk.tribble.index.tabix.TabixIndexCreator) Index(htsjdk.tribble.index.Index) IOException(java.io.IOException) Comparator(java.util.Comparator) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) File(java.io.File)
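
The write-then-index loop above, reduced to its core pattern: capture the virtual file pointer before writing each line, write the line through the BlockCompressedOutputStream, and register the feature at that pointer. A self-contained sketch with hard-coded, already-sorted rows; the output name and the data are assumptions, and htsjdk's SimpleFeature stands in for jvarkit's BedLine (note the 0-based BED start becomes 1-based for the Feature).

import java.io.File;
import java.io.IOException;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.tribble.SimpleFeature;
import htsjdk.tribble.index.Index;
import htsjdk.tribble.index.tabix.TabixFormat;
import htsjdk.tribble.index.tabix.TabixIndexCreator;

public class MiniBedIndexSketch {
    public static void main(final String[] args) throws IOException {
        final File out = new File("mini.bed.gz");  // hypothetical output
        final BlockCompressedOutputStream writer = new BlockCompressedOutputStream(out);
        final TabixIndexCreator indexCreator = new TabixIndexCreator(TabixFormat.BED);
        final String[][] rows = { { "chr1", "100", "200" }, { "chr1", "300", "400" } };  // must be sorted
        long filePosition = writer.getFilePointer();
        for (final String[] r : rows) {
            writer.write((String.join("\t", r) + "\n").getBytes());
            // Register the feature at the pointer captured *before* its line was written
            indexCreator.addFeature(
                    new SimpleFeature(r[0], Integer.parseInt(r[1]) + 1, Integer.parseInt(r[2])),
                    filePosition);
            filePosition = writer.getFilePointer();
        }
        writer.flush();
        final Index index = indexCreator.finalizeIndex(writer.getFilePointer());
        index.writeBasedOnFeatureFile(out);  // writes the index next to mini.bed.gz
        writer.close();
    }
}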

Aggregations

BlockCompressedOutputStream (htsjdk.samtools.util.BlockCompressedOutputStream) 5
Index (htsjdk.tribble.index.Index) 3
TabixIndexCreator (htsjdk.tribble.index.tabix.TabixIndexCreator) 3
File (java.io.File) 3
RuntimeIOException (htsjdk.samtools.util.RuntimeIOException) 2
LineIterator (htsjdk.tribble.readers.LineIterator) 2
IOException (java.io.IOException) 2
BedLine (com.github.lindenb.jvarkit.util.bio.bed.BedLine) 1
InvalidRecordException (exceptions.InvalidRecordException) 1
SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord) 1
SAMTextHeaderCodec (htsjdk.samtools.SAMTextHeaderCodec) 1
BinaryCodec (htsjdk.samtools.util.BinaryCodec) 1
BlockCompressedInputStream (htsjdk.samtools.util.BlockCompressedInputStream) 1
MalformedFeatureFile (htsjdk.tribble.TribbleException.MalformedFeatureFile) 1
VCFCodec (htsjdk.variant.vcf.VCFCodec) 1
VCFFileReader (htsjdk.variant.vcf.VCFFileReader) 1
VCFHeader (htsjdk.variant.vcf.VCFHeader) 1
SQLException (java.sql.SQLException) 1
Comparator (java.util.Comparator) 1
UserException (org.broadinstitute.hellbender.exceptions.UserException) 1