Use of htsjdk.samtools.util.BlockCompressedOutputStream in project gatk by broadinstitute.
The class GatherVcfs, method gatherWithBlockCopying.
/**
* Assumes that all inputs and outputs are block compressed VCF files and copies them without decompressing and parsing
* most of the gzip blocks. Will decompress and parse blocks up to the one containing the end of the header in each file
* (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent
* blocks (excluding a terminator block if present) are copied directly from input to output.
*/
private static void gatherWithBlockCopying(final List<File> vcfs, final File output) {
    try (final FileOutputStream out = new FileOutputStream(output)) {
        boolean isFirstFile = true;
        for (final File f : vcfs) {
            log.info("Gathering " + f.getAbsolutePath());
            try (final FileInputStream in = new FileInputStream(f)) {
                // a) It's good to check that the end of the file is valid and b) we need to know if there's a terminator block and not copy it
                final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(f);
                if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE)
                    throw new UserException.MalformedFile(f.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
                if (!isFirstFile) {
                    final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in, false);
                    boolean lastByteNewline = true;
                    while (in.available() > 0) {
                        // Read a block - blockIn.available() is guaranteed to return the bytes remaining in the block that has been
                        // read, and since we haven't consumed any yet, that is the block size.
                        final int blockLength = blockIn.available();
                        final byte[] blockContents = new byte[blockLength];
                        final int read = blockIn.read(blockContents);
                        Utils.validate(blockLength > 0 && read == blockLength, "Could not read available bytes from BlockCompressedInputStream.");
                        // Scan forward within the block to see if we can find the end of the header within this block
                        int firstNonHeaderByteIndex = -1;
                        for (int i = 0; i < read; ++i) {
                            final byte b = blockContents[i];
                            final boolean thisByteNewline = (b == '\n' || b == '\r');
                            if (lastByteNewline && !thisByteNewline && b != '#') {
                                // Aha! Found first byte of non-header data in file!
                                firstNonHeaderByteIndex = i;
                                break;
                            }
                            lastByteNewline = thisByteNewline;
                        }
                        // If the end of the header was found, re-compress the remainder of this block into a
                        // new gzip block and then break out of the while loop
                        if (firstNonHeaderByteIndex >= 0) {
                            final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(out, null);
                            blockOut.write(blockContents, firstNonHeaderByteIndex, blockContents.length - firstNonHeaderByteIndex);
                            blockOut.flush();
                            // Don't close blockOut because closing the underlying stream would break everything
                            break;
                        }
                    }
                }
                // Copy remainder of input stream into output stream
                final long currentPos = in.getChannel().position();
                final long length = f.length();
                final long skipLast = (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) ? BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
                final long bytesToWrite = length - skipLast - currentPos;
                IOUtil.transferByStream(in, out, bytesToWrite);
                isFirstFile = false;
            }
        }
        // And lastly add the terminator block and close up
        out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    } catch (final IOException ioe) {
        throw new RuntimeIOException(ioe);
    }
}
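The raw block-copy trick that makes this fast can be sketched in isolation. Below is a minimal, hypothetical helper (not GATK code) that concatenates BGZF files byte-for-byte, skipping each input's trailing terminator block and writing a single terminator at the end. Unlike gatherWithBlockCopying it does not strip headers from later inputs, so it only applies when those inputs carry no headers of their own; it reuses the same htsjdk classes shown above (BlockCompressedInputStream, BlockCompressedStreamConstants, IOUtil).

// Hypothetical sketch: concatenate BGZF files by raw byte copying, with one terminator at the end.
private static void concatenateBgzf(final List<File> inputs, final File output) throws IOException {
    try (final FileOutputStream out = new FileOutputStream(output)) {
        for (final File f : inputs) {
            final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(f);
            if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE) {
                throw new IOException(f + " does not end with a valid BGZF block.");
            }
            // Skip the 28-byte empty terminator block, if present, so it is not duplicated mid-file.
            final long skipLast = (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK)
                    ? BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length : 0;
            try (final FileInputStream in = new FileInputStream(f)) {
                IOUtil.transferByStream(in, out, f.length() - skipLast);
            }
        }
        // A single terminator block marks the end of the combined stream.
        out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    }
}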
Use of htsjdk.samtools.util.BlockCompressedOutputStream in project gatk by broadinstitute.
The class SparkUtils, method writeBAMHeaderToStream.
/**
* Private helper method for {@link #convertHeaderlessHadoopBamShardToBam} that takes a SAMFileHeader and writes it
* to the provided `OutputStream`, correctly encoded for the BAM format and preceded by the BAM magic bytes.
*
* @param samFileHeader SAM header to write
* @param outputStream stream to write the SAM header to
*/
private static void writeBAMHeaderToStream(final SAMFileHeader samFileHeader, final OutputStream outputStream) {
    final BlockCompressedOutputStream blockCompressedOutputStream = new BlockCompressedOutputStream(outputStream, null);
    final BinaryCodec outputBinaryCodec = new BinaryCodec(new DataOutputStream(blockCompressedOutputStream));
    final String headerString;
    final Writer stringWriter = new StringWriter();
    new SAMTextHeaderCodec().encode(stringWriter, samFileHeader, true);
    headerString = stringWriter.toString();
    outputBinaryCodec.writeBytes(ReadUtils.BAM_MAGIC);
    // calculate and write the length of the SAM file header text and the header text
    outputBinaryCodec.writeString(headerString, true, false);
    // write the sequences binarily. This is redundant with the text header
    outputBinaryCodec.writeInt(samFileHeader.getSequenceDictionary().size());
    for (final SAMSequenceRecord sequenceRecord : samFileHeader.getSequenceDictionary().getSequences()) {
        outputBinaryCodec.writeString(sequenceRecord.getSequenceName(), true, true);
        outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength());
    }
    try {
        blockCompressedOutputStream.flush();
    } catch (final IOException ioe) {
        throw new RuntimeIOException(ioe);
    }
}
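Because writeBAMHeaderToStream only flushes the BGZF writer and deliberately leaves the underlying stream open, the caller is expected to append the record data and the BGZF terminator itself. A hypothetical caller sketch, roughly what convertHeaderlessHadoopBamShardToBam needs to do (the helper name and file arguments are placeholders, and java.nio.file.Files is assumed for the raw copy):

// Hypothetical sketch: prepend a BAM header to a headerless BGZF shard, then terminate the file.
private static void assembleBamFromShard(final SAMFileHeader header, final File shard, final File outputBam) throws IOException {
    try (final OutputStream out = new FileOutputStream(outputBam)) {
        writeBAMHeaderToStream(header, out);                          // BAM magic, text header, sequence dictionary
        Files.copy(shard.toPath(), out);                              // raw BGZF blocks that carry no header of their own
        out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);   // single BGZF terminator at the very end
    }
}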
Use of htsjdk.samtools.util.BlockCompressedOutputStream in project ASCIIGenome by dariober.
The class UcscFetch, method blockCompressAndIndex.
/**
* Block compress input file and create associated tabix index. Newly created file and index are
* deleted on exit if deleteOnExit is true.
* @throws IOException
* @throws InvalidRecordException
*/
private void blockCompressAndIndex(String in, String bgzfOut, boolean deleteOnExit) throws IOException, InvalidRecordException {
    File inFile = new File(in);
    File outFile = new File(bgzfOut);
    LineIterator lin = utils.IOUtils.openURIForLineIterator(inFile.getAbsolutePath());
    BlockCompressedOutputStream writer = new BlockCompressedOutputStream(outFile);
    long filePosition = writer.getFilePointer();
    TabixIndexCreator indexCreator = new TabixIndexCreator(TabixFormat.GFF);
    while (lin.hasNext()) {
        String line = lin.next();
        GtfLine gtf = new GtfLine(line.split("\t"));
        writer.write(line.getBytes());
        writer.write('\n');
        indexCreator.addFeature(gtf, filePosition);
        filePosition = writer.getFilePointer();
    }
    writer.flush();
    File tbi = new File(bgzfOut + TabixUtils.STANDARD_INDEX_EXTENSION);
    if (tbi.exists() && tbi.isFile()) {
        writer.close();
        throw new RuntimeException("Index file exists: " + tbi);
    }
    Index index = indexCreator.finalizeIndex(writer.getFilePointer());
    index.writeBasedOnFeatureFile(outFile);
    writer.close();
    if (deleteOnExit) {
        outFile.deleteOnExit();
        File idx = new File(outFile.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION);
        idx.deleteOnExit();
    }
}
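A hypothetical invocation (file names are placeholders): compress a fetched GFF track, write its .tbi next to it, and mark both for deletion when the JVM exits. Note the guard above: if a matching .tbi already exists, the method closes the writer and throws rather than silently overwriting the index.

// Hypothetical usage; produces refGene.gff.gz and refGene.gff.gz.tbi.
blockCompressAndIndex("refGene.gff", "refGene.gff.gz", true);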
Use of htsjdk.samtools.util.BlockCompressedOutputStream in project ASCIIGenome by dariober.
The class MakeTabixIndex, method blockCompressAndIndex.
/**
* Block compress input file and create associated tabix index.
* @throws IOException
* @throws InvalidRecordException
*/
private void blockCompressAndIndex(String intab, File bgzfOut, TabixFormat fmt) throws IOException, InvalidRecordException {
    LineIterator lin = utils.IOUtils.openURIForLineIterator(intab);
    BlockCompressedOutputStream writer = new BlockCompressedOutputStream(bgzfOut);
    long filePosition = writer.getFilePointer();
    TabixIndexCreator indexCreator = new TabixIndexCreator(fmt);
    boolean first = true;
    // This is relevant to vcf files only: Prepare header and codec
    // ------------------------------------------------------------
    VCFHeader vcfHeader = null;
    VCFCodec vcfCodec = null;
    if (fmt.equals(TabixFormat.VCF)) {
        try {
            VCFFileReader vcfr = new VCFFileReader(new File(intab), false);
            // new VCFHeader();
            vcfHeader = vcfr.getFileHeader();
            vcfr.close();
        } catch (MalformedFeatureFile e) {
            vcfHeader = new VCFHeader();
        }
        vcfCodec = new VCFCodec();
        vcfCodec.setVCFHeader(vcfHeader, Utils.getVCFHeaderVersion(vcfHeader));
    }
    // ------------------------------------------------------------
    int nWarnings = 10;
    while (lin.hasNext()) {
        String line = lin.next().trim();
        try {
            if (line.isEmpty() || line.startsWith("track ")) {
                continue;
            }
            if (line.startsWith("#")) {
                writer.write((line + "\n").getBytes());
                filePosition = writer.getFilePointer();
                continue;
            }
            if (line.startsWith("##FASTA")) {
                break;
            }
            if (first && !fmt.equals(TabixFormat.VCF)) {
                String dummy = this.makeDummyLine(line, fmt);
                addLineToIndex(dummy, indexCreator, filePosition, fmt, null, null);
                writer.write(dummy.getBytes());
                writer.write('\n');
                filePosition = writer.getFilePointer();
                first = false;
            }
            addLineToIndex(line, indexCreator, filePosition, fmt, vcfHeader, vcfCodec);
            writer.write(line.getBytes());
            writer.write('\n');
            filePosition = writer.getFilePointer();
        } catch (Exception e) {
            if (e.getMessage().contains("added out sequence of order") || e.getMessage().contains("Features added out of order")) {
                // String markers for out-of-order records, taken from htsjdk/tribble/index/tabix/TabixIndexCreator.java
                throw new InvalidRecordException();
            }
            if (nWarnings >= 0) {
                System.err.println("Warning: " + e.getMessage() + ". Skipping:\n" + line);
            }
            if (nWarnings == 0) {
                System.err.println("Additional warnings will not be shown.");
            }
            nWarnings--;
        }
    }
    writer.flush();
    Index index = indexCreator.finalizeIndex(writer.getFilePointer());
    index.writeBasedOnFeatureFile(bgzfOut);
    writer.close();
    CloserUtil.close(lin);
}
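A hypothetical invocation from within MakeTabixIndex (paths are placeholders). For VCF input the method first tries to read the real header via VCFFileReader and falls back to an empty VCFHeader if the file cannot be parsed; the .tbi is then written next to the compressed output by index.writeBasedOnFeatureFile.

// Hypothetical usage; produces variants.vcf.gz plus variants.vcf.gz.tbi alongside it.
blockCompressAndIndex("variants.vcf", new File("variants.vcf.gz"), TabixFormat.VCF);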
Use of htsjdk.samtools.util.BlockCompressedOutputStream in project jvarkit by lindenb.
The class BedIndexTabix, method run.
protected void run(LineIterator in) throws IOException {
    int bedLineCount = 0;
    File tbi = new File(outputFile.getPath() + TabixUtils.STANDARD_INDEX_EXTENSION);
    BlockCompressedOutputStream writer = null;
    SortingCollection<String> sorter = null;
    final Comparator<String> comparator = new Comparator<String>() {
        @Override
        public int compare(String o1, String o2) {
            BedLine bed1 = bedCodec.decode(o1);
            BedLine bed2 = bedCodec.decode(o2);
            int i = bed1.getContig().compareTo(bed2.getContig());
            if (i != 0)
                return i;
            i = bed1.getStart() - bed2.getStart();
            if (i != 0)
                return i;
            i = bed1.getEnd() - bed2.getEnd();
            if (i != 0)
                return i;
            return o1.compareTo(o2);
        }
    };
    CloseableIterator<String> iter = null;
    try {
        TabixIndexCreator indexCreator = new TabixIndexCreator(TabixFormat.BED);
        LOG.info("Opening " + outputFile);
        writer = new BlockCompressedOutputStream(this.outputFile);
        StringBuilder header = new StringBuilder();
        while (in.hasNext()) {
            String h = in.peek();
            if (!BedLine.isBedHeader(h))
                break;
            header.append(in.next()).append('\n');
        }
        // write header
        if (header.length() > 0) {
            LOG.info("Writing header");
            writer.write(header.toString().getBytes());
        }
        if (this.sort) {
            LOG.info("Sorting");
            sorter = SortingCollection.newInstance(String.class, new BedDataCodec(), comparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            while (in.hasNext()) {
                String line = in.next();
                BedLine bed = bedCodec.decode(line);
                if (bed == null)
                    continue;
                sorter.add(line);
            }
            sorter.doneAdding();
            sorter.setDestructiveIteration(true);
            iter = sorter.iterator();
            long filePosition = writer.getFilePointer();
            while (iter.hasNext()) {
                String line = iter.next();
                BedLine bed = this.bedCodec.decode(line);
                writer.write(line.getBytes());
                writer.write('\n');
                indexCreator.addFeature(bed, filePosition);
                filePosition = writer.getFilePointer();
            }
            sorter.cleanup();
        } else {
            long filePosition = writer.getFilePointer();
            while (in.hasNext()) {
                String line = in.next();
                BedLine bed = this.bedCodec.decode(line);
                if (bed == null)
                    continue;
                writer.write(line.getBytes());
                writer.write('\n');
                indexCreator.addFeature(bed, filePosition);
                filePosition = writer.getFilePointer();
            }
        }
        writer.flush();
        LOG.info("Creating index");
        Index index = indexCreator.finalizeIndex(writer.getFilePointer());
        LOG.info("Writing index to " + tbi + " using " + index.getClass());
        index.writeBasedOnFeatureFile(this.outputFile);
        writer.close();
        writer = null;
        LOG.info("Done N=" + bedLineCount);
    } catch (Exception e) {
        if (this.outputFile.exists() && this.outputFile.isFile()) {
            LOG.warning("Deleting " + this.outputFile);
            this.outputFile.delete();
            if (tbi.exists() && tbi.isFile())
                tbi.delete();
        }
        throw new IOException(e);
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sorter);
        CloserUtil.close(writer);
    }
}
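Once the bgzip output and its .tbi are in place, the result can be queried by region with htsjdk's TabixReader (htsjdk.tribble.readers.TabixReader). A minimal read-back sketch, assuming a hypothetical output file sorted.bed.gz produced by the method above:

// Hypothetical read-back: the reader locates sorted.bed.gz.tbi automatically.
private static void printRegion() throws IOException {
    final TabixReader tabix = new TabixReader("sorted.bed.gz");
    final TabixReader.Iterator hits = tabix.query("chr1:10000-20000"); // 1-based, inclusive region string
    String record;
    while ((record = hits.next()) != null) {
        System.out.println(record);
    }
    tabix.close();
}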