use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class ReferenceToVCF method doWork.
@Override
public int doWork(List<String> args) {
if (this.bedFile != null) {
if (limitBed == null)
limitBed = new IntervalTreeMap<Boolean>();
try {
Pattern tab = Pattern.compile("[\t]");
BufferedReader r = IOUtils.openFileForBufferedReading(this.bedFile);
String line;
while ((line = r.readLine()) != null) {
if (BedLine.isBedHeader(line))
continue;
if (line.startsWith("#") || line.isEmpty())
continue;
String[] tokens = tab.split(line, 4);
limitBed.put(new Interval(tokens[0], 1 + Integer.parseInt(tokens[1]), 1 + Integer.parseInt(tokens[2])), true);
}
CloserUtil.close(r);
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
Random random = new Random(0L);
VariantContextWriter out = null;
try {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(oneAndOnlyOneFile(args)));
SAMSequenceDictionary dict = fasta.getSequenceDictionary();
out = super.openVariantContextWriter(this.outputFile);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
VCFHeader header = new VCFHeader();
header.setSequenceDictionary(dict);
out.writeHeader(header);
final List<List<Allele>> combination = new ArrayList<List<Allele>>(4);
// always keep REF as first allele please
combination.add(Arrays.asList(Allele.create("A", true), Allele.create("C", false), Allele.create("G", false), Allele.create("T", false)));
combination.add(Arrays.asList(Allele.create("C", true), Allele.create("A", false), Allele.create("G", false), Allele.create("T", false)));
combination.add(Arrays.asList(Allele.create("G", true), Allele.create("A", false), Allele.create("C", false), Allele.create("T", false)));
combination.add(Arrays.asList(Allele.create("T", true), Allele.create("A", false), Allele.create("C", false), Allele.create("G", false)));
for (SAMSequenceRecord ssr : dict.getSequences()) {
GenomicSequence genome = new GenomicSequence(fasta, ssr.getSequenceName());
if (this.limitBed != null) {
Interval interval = new Interval(ssr.getSequenceName(), 1, genome.length());
if (!this.limitBed.containsOverlapping(interval))
continue;
}
for (int n = 0; n < genome.length(); ++n) {
progress.watch(ssr.getSequenceIndex(), n);
List<Allele> alleles = null;
byte ref = (byte) genome.charAt(n);
switch(ref) {
case 'a':
case 'A':
alleles = combination.get(0);
break;
case 'c':
case 'C':
alleles = combination.get(1);
break;
case 'g':
case 'G':
alleles = combination.get(2);
break;
case 't':
case 'T':
alleles = combination.get(3);
break;
default:
break;
}
if (alleles == null)
continue;
if (this.limitBed != null) {
Interval interval = new Interval(ssr.getSequenceName(), n + 1, n + 1);
if (!this.limitBed.containsOverlapping(interval))
continue;
}
if (!disjoint_alts) {
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.stop(n + 1);
vcb.alleles(alleles);
out.add(vcb.make());
} else {
for (// index 0 is always REF
int a = 1; // index 0 is always REF
a < 4; // index 0 is always REF
++a) {
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.stop(n + 1);
// index 0 is always REF
vcb.alleles(Arrays.asList(alleles.get(0), alleles.get(a)));
out.add(vcb.make());
}
}
if (insertion_size > 0 && n + 1 < genome.length()) {
alleles = new ArrayList<Allele>(2);
// REFERENCE
alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(n + 1), true));
StringBuilder sb = new StringBuilder(insertion_size + 2);
sb.append(genome.charAt(n));
for (int n2 = 0; n2 < insertion_size; ++n2) {
switch(random.nextInt(4)) {
case 0:
sb.append('A');
break;
case 1:
sb.append('C');
break;
case 2:
sb.append('G');
break;
case 3:
sb.append('T');
break;
}
}
sb.append(genome.charAt(n + 1));
alleles.add(Allele.create(sb.toString(), false));
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.alleles(alleles);
vcb.computeEndFromAlleles(alleles, n + 1);
out.add(vcb.make());
}
if (deletion_size > 0 && n + deletion_size + 1 < genome.length()) {
alleles = new ArrayList<Allele>(2);
// REF
StringBuilder sb = new StringBuilder(deletion_size + 2);
sb.append(genome.charAt(n));
int lastpos = n + 1;
for (int n2 = 0; n2 < deletion_size; ++n2, lastpos++) {
sb.append(genome.charAt(lastpos));
}
sb.append(genome.charAt(lastpos));
alleles.add(Allele.create(sb.toString(), true));
alleles.add(Allele.create("" + genome.charAt(n) + genome.charAt(lastpos), false));
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getSequenceName());
vcb.start(n + 1);
vcb.alleles(alleles);
vcb.computeEndFromAlleles(alleles, n + 1);
out.add(vcb.make());
}
if (out.checkError())
break;
}
if (out.checkError())
break;
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class VCFCompareGT method doWork.
@Override
public int doWork(final List<String> arguments) {
final List<File> inputVcfFiles = new ArrayList<>(IOUtil.unrollFiles(arguments.stream().map(F -> new File(F)).collect(Collectors.toCollection(HashSet::new)), ".vcf", "vcf.gz"));
if (inputVcfFiles.isEmpty()) {
LOG.error("VCF missing.");
return -1;
}
VariantComparator varcmp = new VariantComparator();
SortingCollection<Variant> variants = null;
final Set<String> sampleNames = new LinkedHashSet<>();
try {
variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), varcmp, writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPaths());
variants.setDestructiveIteration(true);
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
for (int i = 0; i < inputVcfFiles.size(); ++i) {
final File vcfFile = inputVcfFiles.get(i);
LOG.info("Opening " + vcfFile);
final VCFFileReader vcfFileReader = new VCFFileReader(vcfFile, false);
final CloseableIterator<VariantContext> iter = vcfFileReader.iterator();
final VCFHeader header = vcfFileReader.getFileHeader();
sampleNames.addAll(header.getSampleNamesInOrder());
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "_" + ((i) + 1), "File: " + vcfFile.getPath()));
long nLines = 0;
while (iter.hasNext()) {
final VariantContext var = iter.next();
if (nLines++ % 10000 == 0) {
LOG.info(vcfFile + " " + nLines);
}
if (!var.isVariant())
continue;
if (!var.hasGenotypes())
continue;
for (final Genotype genotype : var.getGenotypes()) {
final Variant rec = new Variant();
if (!genotype.isAvailable())
continue;
if (!genotype.isCalled())
continue;
if (genotype.isNoCall())
continue;
rec.file_index = i + 1;
rec.sampleName = genotype.getSampleName();
rec.chrom = var.getContig();
rec.start = var.getStart();
rec.end = var.getEnd();
rec.ref = var.getReference().getDisplayString();
if (var.hasID()) {
rec.id = var.getID();
}
if (genotype.hasDP()) {
rec.dp = genotype.getDP();
}
if (genotype.hasGQ()) {
rec.gq = genotype.getGQ();
}
final List<Allele> alleles = genotype.getAlleles();
if (alleles == null)
continue;
if (alleles.size() == 1) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = rec.a1;
} else if (alleles.size() == 2) {
rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
rec.a2 = alleles.get(1).getDisplayString().toUpperCase();
if (rec.a1.compareTo(rec.a2) > 0) {
String tmp = rec.a2;
rec.a2 = rec.a1;
rec.a1 = tmp;
}
} else {
continue;
}
variants.add(rec);
}
}
iter.close();
vcfFileReader.close();
}
variants.doneAdding();
LOG.info("Done Adding");
final Set<String> newSampleNames = new HashSet<>();
for (int i = 0; i < inputVcfFiles.size(); ++i) {
for (final String sample : sampleNames) {
newSampleNames.add(sample + "_" + ((i) + 1));
}
}
final String GenpotypeChangedKey = "GCH";
final String GenpotypeCreated = "GNW";
final String GenpotypeDiff = "GDF";
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Qual"));
metaData.add(new VCFFormatHeaderLine(GenpotypeChangedKey, 1, VCFHeaderLineType.Integer, "Changed Genotype"));
metaData.add(new VCFFormatHeaderLine(GenpotypeCreated, 1, VCFHeaderLineType.Integer, "Genotype Created/Deleted"));
metaData.add(new VCFInfoHeaderLine(GenpotypeDiff, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples with Genotype Difference"));
final VCFHeader header = new VCFHeader(metaData, new ArrayList<String>(newSampleNames));
final VariantContextWriter w = super.openVariantContextWriter(outputFile);
w.writeHeader(header);
final PosComparator posCompare = new PosComparator();
final EqualRangeIterator<Variant> iter = new EqualRangeIterator<>(variants.iterator(), posCompare);
while (iter.hasNext()) {
final List<Variant> row = iter.next();
/**
* this sample is not always the same
*/
final Set<String> samplesModified = new TreeSet<>();
/**
* the number of sample is different from vcflist.size()
*/
final Set<String> samplesCreates = new TreeSet<>();
final Counter<String> samplesSeen = new Counter<>();
for (int x = 0; x < row.size(); ++x) {
final Variant var1 = row.get(x);
samplesSeen.incr(var1.sampleName);
for (int y = x + 1; y < row.size(); ++y) {
final Variant var2 = row.get(y);
if (!var2.sampleName.equals(var1.sampleName))
continue;
if (var1.a1.equals(var2.a1) && var1.a2.equals(var2.a2))
continue;
samplesModified.add(var1.sampleName);
}
}
for (final String sampleName : samplesSeen.keySet()) {
if (samplesSeen.count(sampleName) != inputVcfFiles.size()) {
samplesCreates.add(sampleName);
}
}
final Variant first = row.get(0);
final Set<Allele> alleles = new HashSet<>();
alleles.add(Allele.create(first.ref, true));
for (final Variant var : row) {
alleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
alleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
}
final VariantContextBuilder b = new VariantContextBuilder(getClass().getName(), first.chrom, first.start, first.end, alleles);
// build genotypes
final List<Genotype> genotypes = new ArrayList<Genotype>();
for (final Variant var : row) {
// alleles for this genotype
final List<Allele> galleles = new ArrayList<Allele>();
galleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
galleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
final GenotypeBuilder gb = new GenotypeBuilder();
gb.DP(var.dp);
gb.alleles(galleles);
gb.name(var.sampleName + "_" + var.file_index);
gb.GQ(var.gq);
gb.attribute(GenpotypeChangedKey, samplesModified.contains(var.sampleName) ? 1 : 0);
gb.attribute(GenpotypeCreated, samplesCreates.contains(var.sampleName) ? 1 : 0);
genotypes.add(gb.make());
}
b.genotypes(genotypes);
b.id(first.id);
if (!(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
Set<String> set2 = new TreeSet<String>(samplesModified);
set2.addAll(samplesCreates);
b.attribute(GenpotypeDiff, set2.toArray());
}
if (!only_print_modified || !(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
w.add(b.make());
}
}
iter.close();
w.close();
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (variants != null)
try {
variants.cleanup();
} catch (Exception err) {
}
}
return 0;
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class VcfIn method scanFileSorted.
private int scanFileSorted(final VariantContextWriter vcw, final String databaseVcfUri, final VcfIterator userVcfIn) {
EqualRangeVcfIterator equalRangeDbIter = null;
EqualRangeIterator<VariantContext> equalRangeUserVcf = null;
try {
final VCFHeader header = new VCFHeader(userVcfIn.getHeader());
final SAMSequenceDictionary userVcfDict = header.getSequenceDictionary();
// / NO need if(dict1==null)
if (userVcfDict == null) {
LOG.error(JvarkitException.VcfDictionaryMissing.getMessage("user file"));
return -1;
}
final Comparator<VariantContext> userVcfComparator = VCFUtils.createTidPosComparator(userVcfDict);
equalRangeDbIter = new EqualRangeVcfIterator(VCFUtils.createVcfIterator(databaseVcfUri), userVcfComparator);
this.addMetaData(header);
vcw.writeHeader(header);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(userVcfDict).logger(LOG);
equalRangeUserVcf = new EqualRangeIterator<>(userVcfIn, userVcfComparator);
while (equalRangeUserVcf.hasNext()) {
final List<VariantContext> ctxList = equalRangeUserVcf.next();
progress.watch(ctxList.get(0));
// fill both contextes
final List<VariantContext> dbContexes = new ArrayList<VariantContext>(equalRangeDbIter.next(ctxList.get(0)));
for (final VariantContext userCtx : ctxList) {
boolean keep = dbContexes.stream().filter(V -> sameContext(userCtx, V)).anyMatch(V -> allUserAltFoundInDatabase(userCtx, V));
addVariant(vcw, userCtx, keep);
}
if (vcw.checkError())
break;
}
equalRangeUserVcf.close();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(equalRangeDbIter);
CloserUtil.close(userVcfIn);
CloserUtil.close(vcw);
}
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class VcfConcat method doWork.
@Override
public int doWork(List<String> args) {
VariantContextWriter w = null;
BufferedReader r = null;
try {
if (args.isEmpty()) {
LOG.info("Reading filenames from stdin");
r = IOUtils.openStdinForBufferedReader();
this.inputFiles.addAll(r.lines().filter(line -> !(line.trim().isEmpty() || line.startsWith("#"))).collect(Collectors.toSet()));
r.close();
r = null;
} else {
for (final String filename : args) {
this.inputFiles.addAll(IOUtils.unrollFiles(Arrays.asList(filename)));
}
}
if (this.inputFiles.isEmpty()) {
LOG.error("No input");
return -1;
}
w = super.openVariantContextWriter(this.outputfile);
return fromFiles(w);
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(r);
}
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class VcfToZip method doWork.
@Override
public int doWork(List<String> args) {
if (this.outputFile != null && this.outputFile.getName().endsWith(".zip")) {
LOG.error("Filename must end with '.zip' " + outputFile);
return -1;
}
LineIterator lr = null;
VcfIterator in = null;
ZipOutputStream zout = null;
FileOutputStream fout = null;
int num_vcfs = 0;
VariantContextWriter vcw = null;
args = new ArrayList<>(IOUtils.unrollFiles(args));
try {
int optind = 0;
if (this.outputFile != null) {
fout = new FileOutputStream(this.outputFile);
zout = new ZipOutputStream(fout);
} else {
zout = new ZipOutputStream(stdout());
}
do {
if (args.isEmpty()) {
LOG.info("reading concatenated vcf from stdin");
lr = IOUtils.openStreamForLineIterator(stdin());
} else {
LOG.info("reading concatenated vcf from " + args.get(optind));
lr = IOUtils.openURIForLineIterator(args.get(optind));
}
while (lr.hasNext()) {
++num_vcfs;
in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
final VCFHeader header = in.getHeader();
String filename = null;
if (this.titleHeaderStr != null && !this.titleHeaderStr.isEmpty()) {
final VCFHeaderLine h = header.getOtherHeaderLine(this.titleHeaderStr);
if (h != null && !h.getValue().trim().isEmpty())
filename = h.getValue().trim();
}
if (filename == null || filename.trim().isEmpty()) {
// create title
filename = String.format("vcf2zip.%05d.vcf", num_vcfs);
// set name in header
if (this.titleHeaderStr != null && !this.titleHeaderStr.isEmpty()) {
header.addMetaDataLine(new VCFHeaderLine(this.titleHeaderStr.trim(), filename));
}
}
if (!filename.endsWith(".vcf")) {
filename += ".vcf";
}
if (!this.zipPrefix.isEmpty()) {
filename = this.zipPrefix + (this.zipPrefix.endsWith("/") ? "" : "/") + filename;
}
LOG.info(filename);
final ZipEntry entry = new ZipEntry(filename);
entry.setComment("Created with " + getProgramName());
zout.putNextEntry(entry);
vcw = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
vcw.writeHeader(header);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (in.hasNext()) {
vcw.add(progress.watch(in.next()));
}
vcw.close();
progress.finish();
zout.closeEntry();
in.close();
}
++optind;
} while (optind < args.size());
zout.finish();
zout.flush();
zout.close();
zout = null;
CloserUtil.close(fout);
LOG.info("done. Number of VCFs:" + num_vcfs);
return RETURN_OK;
} catch (final Exception e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(in);
CloserUtil.close(lr);
CloserUtil.close(zout);
CloserUtil.close(fout);
}
}
Aggregations