Use of htsjdk.samtools.util.BlockCompressedInputStream in project gatk by broadinstitute.
The class GatherVcfs, method gatherWithBlockCopying.
/**
* Assumes that all inputs and outputs are block compressed VCF files and copies them without decompressing and parsing
* most of the gzip blocks. Will decompress and parse blocks up to the one containing the end of the header in each file
* (often the first block) and re-compress any data remaining in that block into a new block in the output file. Subsequent
* blocks (excluding a terminator block if present) are copied directly from input to output.
*/
private static void gatherWithBlockCopying(final List<File> vcfs, final File output) {
    try (final FileOutputStream out = new FileOutputStream(output)) {
        boolean isFirstFile = true;
        for (final File f : vcfs) {
            log.info("Gathering " + f.getAbsolutePath());
            try (final FileInputStream in = new FileInputStream(f)) {
                // a) It's good to check that the end of the file is valid, and b) we need to know
                // whether there's a terminator block so that we don't copy it
                final BlockCompressedInputStream.FileTermination term = BlockCompressedInputStream.checkTermination(f);
                if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE) {
                    throw new UserException.MalformedFile(f.getAbsolutePath() + " does not have a valid GZIP block at the end of the file.");
                }
                if (!isFirstFile) {
                    final BlockCompressedInputStream blockIn = new BlockCompressedInputStream(in, false);
                    boolean lastByteNewline = true;
                    while (in.available() > 0) {
                        // Read a block - blockIn.available() is guaranteed to return the bytes remaining in the
                        // block that has been read, and since we haven't consumed any yet, that is the block size.
                        final int blockLength = blockIn.available();
                        final byte[] blockContents = new byte[blockLength];
                        final int read = blockIn.read(blockContents);
                        Utils.validate(blockLength > 0 && read == blockLength, "Could not read available bytes from BlockCompressedInputStream.");

                        // Scan forward within the block to see if we can find the end of the header within this block
                        int firstNonHeaderByteIndex = -1;
                        for (int i = 0; i < read; ++i) {
                            final byte b = blockContents[i];
                            final boolean thisByteNewline = (b == '\n' || b == '\r');
                            if (lastByteNewline && !thisByteNewline && b != '#') {
                                // Aha! Found the first byte of non-header data in the file!
                                firstNonHeaderByteIndex = i;
                                break;
                            }
                            lastByteNewline = thisByteNewline;
                        }

                        // If we found the end of the header, write the remainder of this block out as a
                        // new gzip block and then break out of the while loop
                        if (firstNonHeaderByteIndex >= 0) {
                            final BlockCompressedOutputStream blockOut = new BlockCompressedOutputStream(out, (File) null);
                            blockOut.write(blockContents, firstNonHeaderByteIndex, blockContents.length - firstNonHeaderByteIndex);
                            blockOut.flush();
                            // Don't close blockOut because closing the underlying stream would break everything
                            break;
                        }
                    }
                }

                // Copy the remainder of the input stream into the output stream
                final long currentPos = in.getChannel().position();
                final long length = f.length();
                final long skipLast = (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK)
                        ? BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length
                        : 0;
                final long bytesToWrite = length - skipLast - currentPos;
                IOUtil.transferByStream(in, out, bytesToWrite);
                isFirstFile = false;
            }
        }
        // And lastly add the terminator block and close up
        out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
    } catch (final IOException ioe) {
        throw new RuntimeIOException(ioe);
    }
}
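For comparison, a minimal sketch of BGZF-aware concatenation built from the same htsjdk primitives (checkTermination and EMPTY_GZIP_BLOCK). Unlike gatherWithBlockCopying above, it does not strip the per-file VCF headers, so it only yields a valid VCF when the inputs after the first are headerless; the BgzfConcat class is hypothetical.

import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.List;

public final class BgzfConcat {
    public static void concat(final List<File> inputs, final File output) throws IOException {
        try (OutputStream out = Files.newOutputStream(output.toPath())) {
            for (final File f : inputs) {
                // Refuse files that do not end on a clean BGZF block boundary.
                final BlockCompressedInputStream.FileTermination term =
                        BlockCompressedInputStream.checkTermination(f);
                if (term == BlockCompressedInputStream.FileTermination.DEFECTIVE) {
                    throw new IOException(f + " is not valid BGZF");
                }
                final byte[] bytes = Files.readAllBytes(f.toPath());
                // Drop the 28-byte empty terminator block, if present, so the output
                // ends with exactly one terminator, written below.
                int len = bytes.length;
                if (term == BlockCompressedInputStream.FileTermination.HAS_TERMINATOR_BLOCK) {
                    len -= BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK.length;
                }
                out.write(bytes, 0, len);
            }
            out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);
        }
    }
}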
Use of htsjdk.samtools.util.BlockCompressedInputStream in project ASCIIGenome by dariober.
The class IOUtils, method tryBGZIP.
private static InputStream tryBGZIP(InputStream in) throws IOException {
    // Read a probe of the stream, then push the bytes back so the returned
    // stream still starts at the very beginning.
    byte[] buffer = new byte[2048];
    PushbackInputStream push_back = new PushbackInputStream(in, buffer.length + 10);
    int nReads = push_back.read(buffer);
    if (nReads > 0) {
        // Guard against an empty stream: unread() would throw for a negative length.
        push_back.unread(buffer, 0, nReads);
    }
    try {
        // Try to decode the probe as BGZF; this fails if the data is not block-compressed.
        BlockCompressedInputStream bgz = new BlockCompressedInputStream(new ByteArrayInputStream(buffer, 0, nReads));
        bgz.read();
        bgz.close();
        return new BlockCompressedInputStream(push_back);
    } catch (Exception err) {
        // Not BGZF; fall back to plain gzip.
        return new GZIPInputStream(push_back);
    }
}
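A hypothetical caller for the method above (assuming tryBGZIP were made accessible from outside IOUtils): it reads a compressed file line by line without knowing in advance whether the file is BGZF or plain gzip.

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

public class TryBgzipDemo {
    public static void main(final String[] args) throws IOException {
        // Works for both bgzip-compressed and plain-gzip files.
        try (InputStream in = IOUtils.tryBGZIP(new FileInputStream(args[0]));
             BufferedReader br = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}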
Use of htsjdk.samtools.util.BlockCompressedInputStream in project jvarkit by lindenb.
The class IOUtils, method tryBGZIP.
private static InputStream tryBGZIP(final InputStream in) throws IOException {
    final byte[] buffer = new byte[BlockCompressedStreamConstants.GZIP_BLOCK_PREAMBLE.length];
    final PushbackInputStream push_back = new PushbackInputStream(in, buffer.length + 10);
    int nReads = push_back.read(buffer);
    push_back.unread(buffer, 0, nReads);
    try {
        // Compare the probe against the fixed bytes of the BGZF block preamble.
        if (nReads >= buffer.length &&
            buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 &&
            buffer[1] == (byte) BlockCompressedStreamConstants.GZIP_ID2 &&
            buffer[2] == BlockCompressedStreamConstants.GZIP_CM_DEFLATE &&
            buffer[3] == BlockCompressedStreamConstants.GZIP_FLG &&
            buffer[8] == BlockCompressedStreamConstants.GZIP_XFL) {
            return new BlockCompressedInputStream(push_back);
        }
    } catch (final Exception err) {
        // not BGZF
    }
    return new GZIPInputStream(push_back);
}
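htsjdk can do this sniffing itself via BlockCompressedInputStream.isValidFile(InputStream), which needs a mark/reset-capable stream. A sketch of an equivalent helper; the class and method names are made up.

import htsjdk.samtools.util.BlockCompressedInputStream;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

public final class MaybeBgzip {
    static InputStream open(final InputStream raw) throws IOException {
        // BufferedInputStream supplies the mark/reset support that isValidFile() requires.
        final BufferedInputStream buffered = new BufferedInputStream(raw);
        return BlockCompressedInputStream.isValidFile(buffered)
                ? new BlockCompressedInputStream(buffered)
                : new GZIPInputStream(buffered);
    }
}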
Use of htsjdk.samtools.util.BlockCompressedInputStream in project jvarkit by lindenb.
The class VcfRegistryCGI, method doWork.
private void doWork(XMLStreamWriter w, final GroupFile gf) throws XMLStreamException {
    Position pos = parsePosition();
    if (pos == null)
        return;
    w.writeStartElement("div");
    w.writeStartElement("h2");
    w.writeCharacters(pos.chrom + ":" + pos.pos);
    w.writeEndElement();
    w.writeStartElement("table");
    w.writeStartElement("thead");
    w.writeStartElement("tr");
    for (String header : new String[] { "CHROM", "POS", "ID", "REF", "QUAL", "Sample", "Alleles", "DP", "GQ", "File" }) {
        w.writeStartElement("th");
        w.writeCharacters(header);
        // th
        w.writeEndElement();
    }
    // tr
    w.writeEndElement();
    // thead
    w.writeEndElement();
    w.writeStartElement("tbody");
    Set<String> samplesWithGenotypes = new HashSet<String>();
    Set<String> allSamples = new HashSet<String>();
    for (VcfFile f : getVcfFiles(gf)) {
        TabixReader tabixReader = null;
        TabixReader.Iterator iter = null;
        BlockCompressedInputStream bgzin = null;
        VCFHeader header = null;
        AbstractVCFCodec vcfCodec = VCFUtils.createDefaultVCFCodec();
        LineIterator lineIterator = null;
        // Pass 0 reads the VCF header; pass 1 runs the tabix query.
        for (int i = 0; i < 2; i++) {
            try {
                if (i == 0) {
                    bgzin = new BlockCompressedInputStream(f.file);
                    lineIterator = new LineIteratorImpl(new SynchronousLineReader(bgzin));
                    header = (VCFHeader) vcfCodec.readActualHeader(lineIterator);
                    allSamples.addAll(header.getGenotypeSamples());
                } else {
                    tabixReader = new TabixReader(f.file.getPath());
                    String line;
                    int[] x = tabixReader.parseReg(pos.chrom + ":" + pos.pos + "-" + (pos.pos));
                    if (x != null && x.length > 2 && x[0] != -1) {
                        iter = tabixReader.query(x[0], x[1], x[2]);
                    }
                    while (iter != null && (line = iter.next()) != null) {
                        VariantContext var = vcfCodec.decode(line);
                        for (String sample : header.getSampleNamesInOrder()) {
                            final Genotype genotype = var.getGenotype(sample);
                            if (genotype == null || !genotype.isCalled())
                                continue;
                            if (!genotype.isAvailable())
                                continue;
                            samplesWithGenotypes.add(sample);
                            w.writeStartElement("tr");
                            w.writeStartElement("td");
                            w.writeCharacters(var.getContig());
                            w.writeEndElement();
                            w.writeStartElement("td");
                            w.writeCharacters(String.valueOf(var.getStart()));
                            w.writeEndElement();
                            if (var.hasID()) {
                                w.writeStartElement("td");
                                if (var.getID().matches("rs[0-9]+")) {
                                    w.writeStartElement("a");
                                    w.writeAttribute("href", "http://www.ncbi.nlm.nih.gov/snp/" + var.getID().substring(2));
                                    w.writeCharacters(var.getID());
                                    // a
                                    w.writeEndElement();
                                } else {
                                    w.writeCharacters(var.getID());
                                }
                                // td
                                w.writeEndElement();
                            } else {
                                w.writeEmptyElement("td");
                            }
                            if (var.getReference() != null) {
                                w.writeStartElement("td");
                                w.writeCharacters(var.getReference().getBaseString());
                                w.writeEndElement();
                            } else {
                                w.writeEmptyElement("td");
                            }
                            if (var.hasLog10PError()) {
                                w.writeStartElement("td");
                                w.writeCharacters(String.valueOf((int) var.getPhredScaledQual()));
                                w.writeEndElement();
                            } else {
                                w.writeEmptyElement("td");
                            }
                            w.writeStartElement("td");
                            w.writeCharacters(sample);
                            w.writeEndElement();
                            List<Allele> alleles = genotype.getAlleles();
                            w.writeStartElement("td");
                            w.writeStartElement("span");
                            if (genotype.isHomRef()) {
                                w.writeAttribute("style", "color:green;");
                            } else if (genotype.isHomVar()) {
                                w.writeAttribute("style", "color:red;");
                            } else if (genotype.isHet()) {
                                w.writeAttribute("style", "color:blue;");
                            }
                            for (int j = 0; j < alleles.size(); ++j) {
                                if (j > 0)
                                    w.writeCharacters(genotype.isPhased() ? "|" : "/");
                                w.writeCharacters(alleles.get(j).getBaseString());
                            }
                            // span
                            w.writeEndElement();
                            w.writeEndElement();
                            if (genotype.hasDP()) {
                                w.writeStartElement("td");
                                w.writeCharacters(String.valueOf(genotype.getDP()));
                                w.writeEndElement();
                            } else {
                                w.writeEmptyElement("td");
                            }
                            if (genotype.hasGQ()) {
                                w.writeStartElement("td");
                                w.writeCharacters(String.valueOf(genotype.getGQ()));
                                w.writeEndElement();
                            } else {
                                w.writeEmptyElement("td");
                            }
                            w.writeStartElement("td");
                            w.writeCharacters(f.file.getName());
                            w.writeEndElement();
                            // tr
                            w.writeEndElement();
                            w.flush();
                        }
                    }
                }
            } catch (Exception err) {
                w.writeComment("BOUM " + err);
                header = null;
                lastException = err;
            } finally {
                CloserUtil.close(lineIterator);
                CloserUtil.close(bgzin);
                CloserUtil.close(tabixReader);
                CloserUtil.close(iter);
            }
            if (i == 0 && header == null)
                break;
        }
        w.flush();
    }
    // tbody
    w.writeEndElement();
    // table
    w.writeEndElement();
    allSamples.removeAll(samplesWithGenotypes);
    if (!allSamples.isEmpty()) {
        w.writeStartElement("h3");
        w.writeCharacters("Samples not found");
        w.writeEndElement();
        w.writeStartElement("ol");
        for (String sample : new TreeSet<String>(allSamples)) {
            w.writeStartElement("li");
            w.writeCharacters(sample);
            w.writeEndElement();
        }
        w.writeEndElement();
    }
    writeHTMLException(w, this.lastException);
    // div
    w.writeEndElement();
}
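The tabix lookup buried in the loop above reduces to a small stand-alone pattern. A sketch with a placeholder file name and region (a .tbi index must sit next to the VCF); note that how query() treats contigs absent from the index has varied across htsjdk versions.

import htsjdk.tribble.readers.TabixReader;

import java.io.IOException;

public class TabixQueryDemo {
    public static void main(final String[] args) throws IOException {
        final TabixReader tabix = new TabixReader("input.vcf.gz");
        try {
            // Query a 1-based chrom:start-end region string.
            final TabixReader.Iterator iter = tabix.query("chr1:10000-10001");
            String line;
            while ((line = iter.next()) != null) {
                System.out.println(line);
            }
        } finally {
            tabix.close();
        }
    }
}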
Use of htsjdk.samtools.util.BlockCompressedInputStream in project jvarkit by lindenb.
The class VcfOffsetsIndexFactory, method indexVcfFile.
/**
 * Index a VCF file for its variant offsets.
 */
public File indexVcfFile(final File vcfFile, final File indexFile) throws IOException {
    LOG.info("indexing " + vcfFile);
    IOUtil.assertFileIsReadable(vcfFile);
    DataOutputStream daos = null;
    BlockCompressedInputStream bgzin = null;
    AsciiLineReader ascii = null;
    VCFHeader header = null;
    final VCFCodec codec = new VCFCodec();
    SAMSequenceDictionaryProgress progress = null;
    try {
        daos = new DataOutputStream(new FileOutputStream(indexFile));
        daos.write(MAGIC);
        if (vcfFile.getName().endsWith(".vcf.gz")) {
            bgzin = new BlockCompressedInputStream(vcfFile);
            ascii = null;
        } else if (vcfFile.getName().endsWith(".vcf")) {
            bgzin = null;
            ascii = new AsciiLineReader(new FileInputStream(vcfFile));
        } else {
            throw new IllegalArgumentException("not a vcf.gz or vcf file: " + vcfFile);
        }
        final List<String> headerLines = new ArrayList<>();
        for (;;) {
            // Record the offset *before* reading, so it points at the start of the line.
            final long offset = (ascii == null ? bgzin.getPosition() : ascii.getPosition());
            final String line = (ascii == null ? bgzin.readLine() : ascii.readLine());
            if (line == null)
                break;
            if (line.startsWith("#")) {
                headerLines.add(line);
                if (line.startsWith("#CHROM")) {
                    codec.readHeader(new LineIterator() {
                        int i = 0;

                        @Override
                        public String next() {
                            final String s = headerLines.get(i);
                            i++;
                            return s;
                        }

                        @Override
                        public boolean hasNext() {
                            return i < headerLines.size();
                        }

                        @Override
                        public String peek() {
                            return i < headerLines.size() ? headerLines.get(i) : null;
                        }
                    });
                    header = VCFUtils.parseHeader(headerLines).header;
                    progress = new SAMSequenceDictionaryProgress(header);
                    progress.logger(this.logger == null ? LOG : this.logger);
                    progress.setLogPrefix("indexing");
                }
                continue;
            }
            if (progress == null) {
                throw new JvarkitException.FileFormatError("no vcf header in " + vcfFile);
            }
            final VariantContext ctx = codec.decode(line);
            progress.watch(ctx);
            if (this.acceptVariant != null) {
                if (!acceptVariant.test(ctx))
                    continue;
            }
            daos.writeLong(offset);
        }
        if (progress == null) {
            throw new JvarkitException.FileFormatError("no vcf header in " + vcfFile);
        }
        progress.finish();
        daos.flush();
        daos.close();
        return indexFile;
    } catch (final IOException err) {
        throw err;
    } finally {
        CloserUtil.close(ascii);
        CloserUtil.close(bgzin);
        CloserUtil.close(daos);
    }
}
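A sketch of the reverse operation: reading the offsets back, assuming the same MAGIC byte array that indexVcfFile wrote (OffsetIndexReader is a made-up name). For .vcf.gz inputs each value is a BGZF virtual file pointer that can be handed to BlockCompressedInputStream.seek(); for plain .vcf it is a byte offset.

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public final class OffsetIndexReader {
    /** Skips the magic header, then reads one long per indexed variant until EOF. */
    public static List<Long> readOffsets(final File indexFile, final byte[] magic) throws IOException {
        final List<Long> offsets = new ArrayList<>();
        try (DataInputStream in = new DataInputStream(new FileInputStream(indexFile))) {
            final byte[] head = new byte[magic.length];
            in.readFully(head); // assumed to equal MAGIC; not verified here
            while (true) {
                try {
                    offsets.add(in.readLong()); // offset of one accepted variant line
                } catch (final EOFException eof) {
                    break; // clean end of the index
                }
            }
        }
        return offsets;
    }
}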