use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class IndexCovToVcf method doWork.
@Override
public int doWork(final List<String> args) {
if (this.deletionTreshold >= 1.0f) {
LOG.error("Bad deletion treshold >=1.0");
return -1;
}
if (this.duplicationTreshold <= 1.0f) {
LOG.error("Bad duplication treshold <=1.0");
return -1;
}
if (this.deletionTreshold >= this.duplicationTreshold) {
LOG.error("Bad tresholds del>=dup");
return -1;
}
final Pattern tab = Pattern.compile("[\t]");
BufferedReader r = null;
VariantContextWriter vcw = null;
try {
final SAMSequenceDictionary dict;
if (this.refFile == null) {
dict = null;
} else {
dict = SAMSequenceDictionaryExtractor.extractDictionary(this.refFile);
if (dict == null) {
LOG.error("Cannot find dict in " + this.refFile);
return -1;
}
}
r = super.openBufferedReader(oneFileOrNull(args));
String line = r.readLine();
if (line == null) {
LOG.error("Cannot read first line of input");
return -1;
}
String[] tokens = tab.split(line);
if (tokens.length < 4 || !tokens[0].equals("#chrom") || !tokens[1].equals("start") || !tokens[2].equals("end")) {
LOG.error("bad first line " + line);
return -1;
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, "GT");
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, "END");
final VCFFormatHeaderLine foldHeader = new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Relative number of copy: 0.5 deletion 1 normal 2.0 duplication");
metaData.add(foldHeader);
final VCFFormatHeaderLine formatIsDeletion = new VCFFormatHeaderLine("DEL", 1, VCFHeaderLineType.Integer, "set to 1 if relative number of copy <= " + this.deletionTreshold);
metaData.add(formatIsDeletion);
final VCFFormatHeaderLine formatIsDuplication = new VCFFormatHeaderLine("DUP", 1, VCFHeaderLineType.Integer, "set to 1 if relative number of copy >= " + this.duplicationTreshold);
metaData.add(formatIsDuplication);
final VCFFilterHeaderLine filterAllDel = new VCFFilterHeaderLine("ALL_DEL", "number of samples greater than 1 and all are deletions");
metaData.add(filterAllDel);
final VCFFilterHeaderLine filterAllDup = new VCFFilterHeaderLine("ALL_DUP", "number of samples greater than 1 and all are duplication");
metaData.add(filterAllDup);
final VCFFilterHeaderLine filterNoSV = new VCFFilterHeaderLine("NO_SV", "There is no DUP or DEL in this variant");
metaData.add(filterNoSV);
final VCFInfoHeaderLine infoNumDup = new VCFInfoHeaderLine("NDUP", 1, VCFHeaderLineType.Integer, "Number of samples being duplicated");
metaData.add(infoNumDup);
final VCFInfoHeaderLine infoNumDel = new VCFInfoHeaderLine("NDEL", 1, VCFHeaderLineType.Integer, "Number of samples being deleted");
metaData.add(infoNumDel);
final List<String> samples = Arrays.asList(tokens).subList(3, tokens.length);
final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
if (dict != null) {
vcfHeader.setSequenceDictionary(dict);
}
vcw = super.openVariantContextWriter(outputFile);
vcw.writeHeader(vcfHeader);
// final List<Allele> NO_CALL_NO_CALL = Arrays.asList(Allele.NO_CALL,Allele.NO_CALL);
final Allele DUP_ALLELE = Allele.create("<DUP>", false);
final Allele DEL_ALLELE = Allele.create("<DEL>", false);
final Allele REF_ALLELE = Allele.create("N", true);
while ((line = r.readLine()) != null) {
if (StringUtil.isBlank(line))
continue;
tokens = tab.split(line);
if (tokens.length != 3 + samples.size()) {
throw new JvarkitException.TokenErrors("expected " + (samples.size() + 3) + "columns.", tokens);
}
final Set<Allele> alleles = new HashSet<>();
alleles.add(REF_ALLELE);
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(tokens[0]);
vcb.start(Integer.parseInt(tokens[1]));
final int chromEnd = Integer.parseInt(tokens[2]);
vcb.stop(chromEnd);
vcb.attribute(VCFConstants.END_KEY, chromEnd);
if (dict != null) {
final SAMSequenceRecord ssr = dict.getSequence(tokens[0]);
if (ssr == null) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(tokens[0], dict));
return -1;
}
if (chromEnd > ssr.getSequenceLength()) {
LOG.warn("WARNING sequence length in " + line + " is greater than in dictionary ");
}
}
int count_dup = 0;
int count_del = 0;
final List<Genotype> genotypes = new ArrayList<>(samples.size());
for (int i = 3; i < tokens.length; i++) {
final float f = Float.parseFloat(tokens[i]);
if (f < 0 || Float.isNaN(f) || !Float.isFinite(f)) {
LOG.error("Bad fold " + f + " in " + line);
}
final GenotypeBuilder gb;
if (f <= this.deletionTreshold) {
gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(DEL_ALLELE));
alleles.add(DEL_ALLELE);
gb.attribute(formatIsDeletion.getID(), 1);
count_del++;
} else if (f >= this.duplicationTreshold) {
gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(DUP_ALLELE));
alleles.add(DUP_ALLELE);
gb.attribute(formatIsDuplication.getID(), 1);
count_dup++;
} else {
gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(REF_ALLELE));
gb.attribute(formatIsDuplication.getID(), 0);
}
gb.attribute(foldHeader.getID(), f);
genotypes.add(gb.make());
}
vcb.alleles(alleles);
vcb.genotypes(genotypes);
if (count_dup == samples.size() && samples.size() != 1) {
vcb.filter(filterAllDup.getID());
}
if (count_del == samples.size() && samples.size() != 1) {
vcb.filter(filterAllDel.getID());
}
if (count_dup == 0 && count_del == 0) {
vcb.filter(filterNoSV.getID());
}
vcb.attribute(infoNumDel.getID(), count_del);
vcb.attribute(infoNumDup.getID(), count_dup);
vcw.add(vcb.make());
}
vcw.close();
vcw = null;
r.close();
r = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(vcw);
}
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class SamFindClippedRegions method doWork.
/*private static boolean closeTo(int pos1,int pos2, int max)
{
return Math.abs(pos2-pos1)<=max;
}*/
/*
private static boolean same(char c1,char c2)
{
if(c1=='N' || c2=='N') return false;
return Character.toUpperCase(c1)==Character.toUpperCase(c2);
}*/
@Override
public int doWork(List<String> args) {
int readLength = 150;
if (args.isEmpty()) {
LOG.error("illegal.number.of.arguments");
return -1;
}
List<Input> inputs = new ArrayList<Input>();
VariantContextWriter w = null;
// SAMFileWriter w=null;
try {
SAMSequenceDictionary dict = null;
/* create input, collect sample names */
Map<String, Input> sample2input = new HashMap<String, Input>();
for (final String filename : args) {
Input input = new Input(new File(filename));
// input.index=inputs.size();
inputs.add(input);
if (sample2input.containsKey(input.sampleName)) {
LOG.error("Duplicate sample " + input.sampleName + " in " + input.bamFile + " and " + sample2input.get(input.sampleName).bamFile);
return -1;
}
sample2input.put(input.sampleName, input);
if (dict == null) {
dict = input.header.getSequenceDictionary();
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, input.header.getSequenceDictionary())) {
LOG.error("Found more than one dictint sequence dictionary");
return -1;
}
}
LOG.info("Sample N= " + sample2input.size());
/* create merged iterator */
List<SAMFileHeader> headers = new ArrayList<SAMFileHeader>(sample2input.size());
for (Input input : inputs) headers.add(input.header);
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SortOrder.coordinate, headers, true);
List<SamReader> readers = new ArrayList<SamReader>(sample2input.size());
for (Input input : inputs) readers.add(input.samFileReaderScan);
MergingSamRecordIterator merginIter = new MergingSamRecordIterator(headerMerger, readers, true);
Allele reference_allele = Allele.create("N", true);
Allele[] alternate_alleles = new Allele[] { Allele.create("<CLIP5>", false), Allele.create("<CLIP3>", false) };
Set<VCFHeaderLine> vcfHeaderLines = new HashSet<VCFHeaderLine>();
for (Allele alt : alternate_alleles) {
vcfHeaderLines.add(new VCFSimpleHeaderLine("<ID=" + alt.getDisplayString() + ",Description=\"StructVar\">", VCFHeaderVersion.VCF4_1, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")));
}
vcfHeaderLines.add(new VCFInfoHeaderLine("COUNT_SAMPLES", 1, VCFHeaderLineType.Integer, "Number of samples with depth>=" + this.min_depth));
vcfHeaderLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth."));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype"));
vcfHeaderLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth"));
vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
vcfHeaderLines.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
for (int side = 0; side < 2; ++side) {
vcfHeaderLines.add(new VCFFormatHeaderLine("CN" + (side == 0 ? 5 : 3), 1, VCFHeaderLineType.Integer, "count clipped in " + (side == 0 ? 5 : 3) + "'"));
}
if (dict != null) {
vcfHeaderLines.addAll(VCFUtils.samSequenceDictToVCFContigHeaderLine(dict));
}
VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample2input.keySet());
w = VCFUtils.createVariantContextWriterToStdout();
w.writeHeader(vcfHeader);
final IntervalTreeMap<Boolean> intervals = new IntervalTreeMap<>();
// w=swf.make(header, System.out);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
if (bedFile != null) {
final BedLineCodec bedLineCodec = new BedLineCodec();
LOG.info("Reading " + bedFile);
BufferedReader r = IOUtils.openFileForBufferedReading(bedFile);
String line;
while ((line = r.readLine()) != null) {
BedLine bedLine = bedLineCodec.decode(line);
if (bedLine == null)
continue;
if (dict != null && dict.getSequence(bedLine.getContig()) == null) {
LOG.warning("undefined chromosome in " + bedFile + " " + line);
continue;
}
intervals.put(bedLine.toInterval(), true);
}
CloserUtil.close(r);
}
LinkedList<SAMRecord> buffer = new LinkedList<SAMRecord>();
final Predicate<SAMRecord> filterSamRecords = new Predicate<SAMRecord>() {
@Override
public boolean test(SAMRecord rec) {
if (rec.getReadUnmappedFlag())
return false;
if (rec.isSecondaryOrSupplementary())
return false;
if (rec.getDuplicateReadFlag())
return false;
if (rec.getReadFailsVendorQualityCheckFlag())
return false;
Cigar cigar = rec.getCigar();
if (cigar == null || cigar.numCigarElements() < 2)
return false;
boolean found_S = false;
for (int side = 0; side < 2; ++side) {
CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
// read must be clipped on 5' or 3' with a good length
if (!ce.getOperator().equals(CigarOperator.S))
continue;
found_S = true;
break;
}
if (!found_S)
return false;
SAMReadGroupRecord g = rec.getReadGroup();
if (g == null || g.getSample() == null || g.getSample().isEmpty())
return false;
return true;
}
};
final FilteringIterator<SAMRecord> forwardIterator = new FilteringIterator<SAMRecord>(merginIter, filterSamRecords);
for (; ; ) {
SAMRecord rec = null;
if (forwardIterator.hasNext()) {
rec = forwardIterator.next();
progress.watch(rec);
if (intervals != null && !intervals.containsOverlapping(new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd())))
continue;
}
// need to flush buffer ?
if (rec == null || (!buffer.isEmpty() && !buffer.getLast().getReferenceIndex().equals(rec.getReferenceIndex())) || (!buffer.isEmpty() && buffer.getLast().getUnclippedEnd() + readLength < rec.getUnclippedStart())) {
if (!buffer.isEmpty()) {
int chromStart = buffer.getFirst().getUnclippedStart();
int chromEnd = buffer.getFirst().getUnclippedEnd();
for (SAMRecord sam : buffer) {
chromStart = Math.min(chromStart, sam.getUnclippedStart());
chromEnd = Math.max(chromEnd, sam.getUnclippedEnd());
}
final int winShift = 5;
for (int pos = chromStart; pos + winShift <= chromEnd; pos += winShift) {
int[] count_big_clip = new int[] { 0, 0 };
// int max_depth[]=new int[]{0,0};
List<Genotype> genotypes = new ArrayList<Genotype>();
Set<Allele> all_alleles = new HashSet<Allele>();
all_alleles.add(reference_allele);
boolean found_one_depth_ok = false;
int sum_depth = 0;
int samples_with_high_depth = 0;
for (String sample : sample2input.keySet()) {
GenotypeBuilder gb = new GenotypeBuilder(sample);
int[] count_clipped = new int[] { 0, 0 };
Set<Allele> sample_alleles = new HashSet<Allele>(3);
for (int side = 0; side < 2; ++side) {
for (SAMRecord sam : buffer) {
if (!sam.getReadGroup().getSample().equals(sample))
continue;
Cigar cigar = sam.getCigar();
CigarElement ce = cigar.getCigarElement(side == 0 ? 0 : cigar.numCigarElements() - 1);
if (!ce.getOperator().equals(CigarOperator.S))
continue;
int clipStart = (side == 0 ? sam.getUnclippedStart() : sam.getAlignmentEnd() + 1);
int clipEnd = (side == 0 ? sam.getAlignmentStart() - 1 : sam.getUnclippedEnd());
if ((pos + winShift < clipStart || pos > clipEnd))
continue;
count_clipped[side]++;
if (ce.getLength() >= this.min_clip_length) {
count_big_clip[side]++;
}
sample_alleles.add(alternate_alleles[side]);
gb.attribute("CN" + (side == 0 ? 5 : 3), count_clipped[side]);
}
}
// if(!(found_one_big_clip[0] || found_one_big_clip[1])) continue;
if (count_clipped[0] + count_clipped[1] == 0)
continue;
if ((count_clipped[0] + count_clipped[1]) > min_depth) {
found_one_depth_ok = true;
++samples_with_high_depth;
}
sum_depth += (count_clipped[0] + count_clipped[1]);
gb.alleles(new ArrayList<Allele>(sample_alleles));
all_alleles.addAll(sample_alleles);
gb.DP(count_clipped[0] + count_clipped[1]);
genotypes.add(gb.make());
}
if (all_alleles.size() == 1) {
// all homozygotes
continue;
}
if (!found_one_depth_ok) {
continue;
}
if (!(count_big_clip[0] >= 1 || count_big_clip[1] >= 1)) {
continue;
}
Map<String, Object> atts = new HashMap<String, Object>();
atts.put("COUNT_SAMPLES", samples_with_high_depth);
atts.put(VCFConstants.DEPTH_KEY, sum_depth);
VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(buffer.getFirst().getReferenceName());
vcb.start(pos);
vcb.stop(pos + winShift);
vcb.alleles(all_alleles);
vcb.attributes(atts);
vcb.genotypes(genotypes);
w.add(vcb.make());
}
buffer.clear();
}
if (rec == null) {
break;
}
}
buffer.add(rec);
}
merginIter.close();
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
for (Input input : inputs) {
CloserUtil.close(input);
}
}
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class VCFPolyX method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VcfIterator r, final VariantContextWriter delegate) {
final VariantContextWriter w = this.component.open(delegate);
w.writeHeader(r.getHeader());
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(r.getHeader()).logger(LOG);
while (r.hasNext()) {
w.add(progress.watch(r.next()));
}
progress.finish();
w.close();
return RETURN_OK;
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class VCFShuffle method doWork.
@Override
public int doWork(final List<String> args) {
if (seed == -1L)
seed = System.currentTimeMillis();
SortingCollection<RLine> shuffled = null;
VariantContextWriter out = null;
BufferedReader lr = null;
try {
lr = super.openBufferedReader(oneFileOrNull(args));
out = super.openVariantContextWriter(this.outputFile);
final Random random = new Random(this.seed);
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(lr);
final VCFHeader header = cah.header;
super.addMetaData(header);
out.writeHeader(header);
LOG.info("shuffling");
shuffled = SortingCollection.newInstance(RLine.class, new RLineCodec(), (o1, o2) -> {
final int i = o1.rand < o2.rand ? -1 : o1.rand > o2.rand ? 1 : 0;
if (i != 0)
return i;
return o1.line.compareTo(o2.line);
}, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
shuffled.setDestructiveIteration(true);
String line;
while ((line = lr.readLine()) != null) {
final RLine rLine = new RLine();
rLine.rand = random.nextLong();
rLine.line = line;
shuffled.add(rLine);
}
shuffled.doneAdding();
LOG.info("done shuffling");
final CloseableIterator<RLine> iter = shuffled.iterator();
while (iter.hasNext()) {
final VariantContext ctx = cah.codec.decode(iter.next().line);
out.add(ctx);
if (out.checkError())
break;
}
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
if (shuffled != null)
shuffled.cleanup();
CloserUtil.close(shuffled);
CloserUtil.close(lr);
CloserUtil.close(out);
}
}
use of htsjdk.variant.variantcontext.writer.VariantContextWriter in project jvarkit by lindenb.
the class VcfSimulator method doWork.
@Override
public int doWork(final List<String> args) {
if (this.indexedFastaSequenceFile == null) {
LOG.error("Reference is undefined");
return -1;
}
if (!args.isEmpty()) {
LOG.error("too many arguments");
return -1;
}
if (this.randomSeed == -1L) {
this.random = new Random(System.currentTimeMillis());
} else {
this.random = new Random(this.randomSeed);
}
if (this.numSamples < 0) {
this.numSamples = 1 + this.random.nextInt(10);
}
while (this.samples.size() < numSamples) this.samples.add("SAMPLE" + (1 + this.samples.size()));
VariantContextWriter writer = null;
PrintStream pw = null;
try {
final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, "GT", "DP");
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, "AF", "AN", "AC", "DP");
VariantAttributesRecalculator calc = new VariantAttributesRecalculator();
final VCFHeader header = new VCFHeader(metaData, this.samples);
header.setSequenceDictionary(this.indexedFastaSequenceFile.getSequenceDictionary());
calc.setHeader(header);
pw = super.openFileOrStdoutAsPrintStream(this.outputFile);
writer = VCFUtils.createVariantContextWriterToOutputStream(pw);
writer.writeHeader(header);
long countVariantsSoFar = 0;
for (; ; ) {
if (pw.checkError())
break;
if (this.numberOfVariants >= 0 && countVariantsSoFar >= this.numberOfVariants)
break;
for (final SAMSequenceRecord ssr : this.indexedFastaSequenceFile.getSequenceDictionary().getSequences()) {
if (pw.checkError())
break;
final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
for (int pos = 1; pos <= ssr.getSequenceLength(); ++pos) {
if (pw.checkError())
break;
char REF = Character.toUpperCase(genomic.charAt(pos - 1));
if (REF == 'N')
continue;
char ALT = 'N';
switch(REF) {
case 'A':
ALT = "TGC".charAt(random.nextInt(3));
break;
case 'T':
ALT = "AGC".charAt(random.nextInt(3));
break;
case 'G':
ALT = "TAC".charAt(random.nextInt(3));
break;
case 'C':
ALT = "TGA".charAt(random.nextInt(3));
break;
default:
ALT = 'N';
}
if (ALT == 'N')
continue;
final Allele refAllele = Allele.create((byte) REF, true);
Allele altAllele = Allele.create((byte) ALT, false);
final VariantContextBuilder cb = new VariantContextBuilder();
cb.chr(genomic.getChrom());
cb.start(pos);
cb.stop(pos);
List<Genotype> genotypes = new ArrayList<Genotype>(samples.size());
for (String sample : samples) {
final Allele a1 = (random.nextBoolean() ? refAllele : altAllele);
final Allele a2 = (random.nextBoolean() ? refAllele : altAllele);
GenotypeBuilder gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
if (random.nextBoolean()) {
gb = new GenotypeBuilder(sample, Arrays.asList(a1, a2));
gb.DP(1 + random.nextInt(50));
}
genotypes.add(gb.make());
}
cb.genotypes(genotypes);
cb.alleles(Arrays.asList(refAllele, altAllele));
writer.add(calc.apply(cb.make()));
countVariantsSoFar++;
}
}
}
writer.close();
writer = null;
pw.flush();
pw.close();
pw = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
CloserUtil.close(pw);
CloserUtil.close(writer);
}
}
Aggregations