use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.
the class EigenInfoAnnotator method getAnnotations.
@Override
public Map<String, Object> getAnnotations(final VariantContext ctx) {
if (!ctx.isVariant())
return Collections.emptyMap();
final int contig = contig2eigen(ctx.getContig());
if (contig < 1)
return Collections.emptyMap();
try {
if (this.codingFeatureReader == null) {
this.codingFeatureReader = new TabixFeatureReader<>(new File(this.eigenDirectory, getTabixPrefix() + "coding_annot_04092016.tab.bgz").getPath(), new CodingFeatureCodec());
}
if (this.prev_contig == -1 || prev_contig != contig) {
CloserUtil.close(this.nonCodingFeatureReader);
this.nonCodingFeatureReader = new TabixFeatureReader<>(getNonCodingFileForContig(contig).getPath(), new NonCodingFeatureCodec());
this.prev_contig = contig;
}
final Map<Allele, NonCodingFeature> alt2nonCoding = new HashMap<>();
CloseableTribbleIterator<NonCodingFeature> iter1 = this.nonCodingFeatureReader.query(String.valueOf(contig), ctx.getStart(), ctx.getEnd());
while (iter1.hasNext()) {
NonCodingFeature feat = iter1.next();
if (feat == null || !feat.accept(ctx))
continue;
alt2nonCoding.put(feat.alt, feat);
}
iter1.close();
final Map<Allele, CodingFeature> alt2coding = new HashMap<>();
CloseableTribbleIterator<CodingFeature> iter2 = this.codingFeatureReader.query(String.valueOf(contig), ctx.getStart(), ctx.getEnd());
while (iter2.hasNext()) {
CodingFeature feat = iter2.next();
if (feat == null || !feat.accept(ctx))
continue;
alt2coding.put(feat.alt, feat);
}
iter2.close();
if (alt2nonCoding.isEmpty() && alt2coding.isEmpty())
return Collections.emptyMap();
final List<Allele> alternateAlleles = ctx.getAlternateAlleles();
final Map<String, Object> map = new HashMap<>();
for (int side = 0; side < 2; ++side) {
for (int i = 0; i < (side == 0 ? noncodingheaderlines.size() : codingheaderlines.size()); ++i) {
final VCFInfoHeaderLine vihl = (side == 0 ? noncodingheaderlines.get(i) : codingheaderlines.get(i));
final List<Object> atts = new ArrayList<>(alternateAlleles.size());
boolean found_one = false;
for (int altn = 0; altn < alternateAlleles.size(); ++altn) {
final Allele alt = alternateAlleles.get(altn);
final AbstractFeature feat = (side == 0 ? alt2nonCoding.get(alt) : alt2coding.get(alt));
if (feat == null) {
atts.add(".");
continue;
}
final String token = feat.get(4 + i);
if (token == null || token.isEmpty() || token.equals(".")) {
atts.add(".");
continue;
} else if (vihl.getType() == VCFHeaderLineType.String) {
found_one = true;
atts.add(token.replace(',', '|'));
} else {
try {
Float dbl = new Float(token);
atts.add(dbl);
found_one = true;
} catch (NumberFormatException err) {
throw new IOException("Cannot cast " + token + " to float for " + vihl);
}
}
}
if (!found_one) {
// nothing
} else if (vihl.getCountType() == VCFHeaderLineCount.R) {
map.put(vihl.getID(), new ArrayList<>(new LinkedHashSet<>(atts)));
} else {
if (atts.size() != ctx.getAlternateAlleles().size()) {
System.err.println("Number of eigen data!=number of ALTS");
}
map.put(vihl.getID(), atts);
}
}
}
return map;
} catch (final IOException err) {
throw new RuntimeException(err);
}
}
use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.
the class VCFFixIndels method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator r, VariantContextWriter w) {
long nChanged = 0L;
final String TAG = "INDELFIXED";
final VCFHeader header = r.getHeader();
final VCFHeader h2 = new VCFHeader(header);
addMetaData(h2);
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, 1, VCFHeaderLineType.String, "Fix Indels for @SolenaLS (position|alleles...)"));
w.writeHeader(h2);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
while (r.hasNext()) {
boolean somethingChanged = false;
VariantContext ctx = progress.watch(r.next());
/* map old allele to new allele */
Map<Allele, Allele> original2modified = new HashMap<Allele, Allele>();
/* did we found a strange allele (symbolic, etc ) */
boolean strange_allele_flag = false;
for (final Allele a : ctx.getAlleles()) {
original2modified.put(a, a);
if (a.isSymbolic() || a.isNoCall()) {
strange_allele_flag = true;
break;
}
}
if (strange_allele_flag || original2modified.size() < 2) /* at least 2 alleles: REF+ ALT */
{
w.add(ctx);
continue;
}
/* record chromStart if we need to shift to the right */
int chromStart = ctx.getStart();
/* trim right then left */
for (int side = 0; side < 2; ++side) {
boolean done = false;
while (!done) {
boolean ok_side = true;
/* common nucleotide at end/start */
Character targetChar = null;
done = true;
// scan side
Set<Allele> keys = new HashSet<>(original2modified.keySet());
for (Allele k : keys) {
Allele newAllele = original2modified.get(k);
if (newAllele.isSymbolic()) {
ok_side = false;
break;
}
String baseString = newAllele.getBaseString().trim().toUpperCase();
if (baseString.length() < 2) {
ok_side = false;
break;
}
/* first or last char or all sequences
* side==0 : right
* side==1 : left
* */
Character baseChar = (side == 0 ? baseString.charAt(baseString.length() - 1) : baseString.charAt(0));
if (targetChar == null) {
targetChar = baseChar;
} else if (!targetChar.equals(baseChar)) {
/* doesn't end with same nucleotide */
ok_side = false;
break;
}
}
/* ok we can shift all alleles */
if (ok_side && targetChar != null) {
done = false;
somethingChanged = true;
for (Allele k : keys) {
Allele newAllele = original2modified.get(k);
String baseString = newAllele.getBaseString().trim().toUpperCase();
if (// remove last nucleotide
side == 0) {
newAllele = Allele.create(baseString.substring(0, baseString.length() - 1), newAllele.isReference());
} else {
newAllele = Allele.create(baseString.substring(1), newAllele.isReference());
}
original2modified.put(k, newAllele);
}
if (side == 1)
chromStart++;
}
}
/* end of while done */
}
if (!somethingChanged) {
w.add(ctx);
continue;
}
final VariantContextBuilder b = new VariantContextBuilder(ctx);
b.start(chromStart);
Allele newRef = original2modified.get(ctx.getReference());
b.stop(chromStart + newRef.getBaseString().length() - 1);
b.alleles(original2modified.values());
List<Genotype> genotypes = new ArrayList<>();
for (final String sample : header.getSampleNamesInOrder()) {
final Genotype g = ctx.getGenotype(sample);
if (!g.isCalled()) {
genotypes.add(g);
continue;
}
final GenotypeBuilder gb = new GenotypeBuilder(g);
final List<Allele> aL = new ArrayList<>();
for (final Allele a : g.getAlleles()) {
aL.add(original2modified.get(a));
}
gb.alleles(aL);
genotypes.add(gb.make());
}
StringBuilder tagContent = new StringBuilder();
tagContent.append(String.valueOf(ctx.getStart()));
for (Allele a : ctx.getAlleles()) {
tagContent.append("|");
tagContent.append(a.toString());
}
b.attribute(TAG, tagContent.toString());
b.genotypes(genotypes);
w.add(b.make());
++nChanged;
if (w.checkError())
break;
}
progress.finish();
LOG.info("indels changed:" + nChanged);
return RETURN_OK;
}
use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.
the class VCFComm method doWork.
@Override
public int doWork(final List<String> args) {
CloseableIterator<LineAndFile> iter = null;
SortingCollection<LineAndFile> variants = null;
VariantContextWriter w = null;
try {
if (args.isEmpty()) {
LOG.error("Illegal number of arguments");
return -1;
}
Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
variants = SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), new LineAndFileComparator(), super.sortingCollectionArgs.getMaxRecordsInRam(), super.sortingCollectionArgs.getTmpPaths());
variants.setDestructiveIteration(true);
/**
* new sample names in the output vcf: one sample per file
*/
final Map<Integer, String> fileid2sampleName = new TreeMap<>();
/**
* samples names as they appear in the original VCF headers
*/
final Counter<String> countInputSamples = new Counter<String>();
/**
* dicts
*/
final List<SAMSequenceDictionary> all_dictionaries = new ArrayList<>();
for (final String vcffilename : IOUtils.unrollFiles(args)) {
LOG.info("Reading from " + vcffilename);
final Input input = super.put(variants, vcffilename);
String sampleName = vcffilename;
if (sampleName.endsWith(".vcf.gz")) {
sampleName = sampleName.substring(0, sampleName.length() - 7);
} else if (sampleName.endsWith(".vcf.gz")) {
sampleName = sampleName.substring(0, sampleName.length() - 4);
}
int slash = sampleName.lastIndexOf(File.separatorChar);
if (slash != -1)
sampleName = sampleName.substring(slash + 1);
int suffix = 1;
// loop until we find a uniq name
for (; ; ) {
final String key = sampleName + (suffix == 1 ? "" : "_" + suffix);
if (fileid2sampleName.values().contains(key)) {
suffix++;
continue;
}
fileid2sampleName.put(input.file_id, key);
metaData.add(new VCFHeaderLine(key, vcffilename));
break;
}
for (final String sname : input.codecAndHeader.header.getSampleNamesInOrder()) {
countInputSamples.incr(sname);
}
all_dictionaries.add(input.codecAndHeader.header.getSequenceDictionary());
}
variants.doneAdding();
/**
* unique sample name, if any present in all VCF
*/
Optional<String> unqueSampleName = Optional.empty();
if (countInputSamples.getCountCategories() == 1 && countInputSamples.count(countInputSamples.keySet().iterator().next()) == fileid2sampleName.size()) {
unqueSampleName = Optional.of(countInputSamples.keySet().iterator().next());
LOG.info("Unique sample name is " + unqueSampleName.get());
}
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_FILTER_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY);
metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
final VCFFilterHeaderLine variantNotCalledInAllVcf = new VCFFilterHeaderLine("NotCalledEveryWhere", "Variant was NOT called in all input VCF");
metaData.add(variantNotCalledInAllVcf);
final VCFFilterHeaderLine variantWasFiltered = new VCFFilterHeaderLine("VariantWasFiltered", "At least one variant was filtered");
metaData.add(variantWasFiltered);
final VCFFormatHeaderLine variantQUALFormat = new VCFFormatHeaderLine("VCQUAL", 1, VCFHeaderLineType.Float, "Variant Quality");
metaData.add(variantQUALFormat);
metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Number of allle in the src vcf"));
metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_COUNT_KEY, 1, VCFHeaderLineType.Integer, "Number of ALT alllele"));
final VCFInfoHeaderLine foundInCountVcfInfo = new VCFInfoHeaderLine("NVCF", 1, VCFHeaderLineType.Integer, "Number of VCF this variant was found");
metaData.add(foundInCountVcfInfo);
final VCFInfoHeaderLine variantTypesInfo = new VCFInfoHeaderLine("VTYPES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Distinct Variants type");
metaData.add(variantTypesInfo);
final VCFFilterHeaderLine multipleTypeFilters = new VCFFilterHeaderLine("DiscordantTypes", "Discordant types at this position");
metaData.add(multipleTypeFilters);
final VCFFormatHeaderLine variantTypeFormat = new VCFFormatHeaderLine("VTYPE", 1, VCFHeaderLineType.String, "Variant Type");
metaData.add(variantTypeFormat);
final VCFFilterHeaderLine uniqueVariantDiscordantGTFilter;
if (unqueSampleName.isPresent()) {
metaData.add(new VCFHeaderLine("UniqSample", unqueSampleName.get()));
uniqueVariantDiscordantGTFilter = new VCFFilterHeaderLine("DiscordantGenotypeForUniqSample", "Genotype Dicordant for for sample " + unqueSampleName.get());
metaData.add(uniqueVariantDiscordantGTFilter);
} else {
uniqueVariantDiscordantGTFilter = null;
}
final VCFHeader header = new VCFHeader(metaData, new ArrayList<>(fileid2sampleName.values()));
if (// all have a dict
!normalize_chr && !all_dictionaries.contains(null)) {
SAMSequenceDictionary thedict = null;
for (int x = 0; x < all_dictionaries.size(); ++x) {
SAMSequenceDictionary d = all_dictionaries.get(x);
if (thedict == null) {
thedict = d;
} else if (!SequenceUtil.areSequenceDictionariesEqual(d, thedict)) {
thedict = null;
break;
}
}
if (thedict != null)
header.setSequenceDictionary(thedict);
}
w = super.openVariantContextWriter(super.outputFile);
w.writeHeader(header);
final List<LineAndFile> row = new ArrayList<LineAndFile>(super.inputs.size());
final Comparator<LineAndFile> posCompare = (A, B) -> A.getContigPosRef().compareTo(B.getContigPosRef());
iter = variants.iterator();
for (; ; ) {
LineAndFile rec = null;
if (iter.hasNext()) {
rec = iter.next();
}
if (rec == null || (!row.isEmpty() && posCompare.compare(row.get(0), rec) != 0)) {
if (!row.isEmpty()) {
final VariantContext first = row.get(0).getContext();
/* in which file id we find this variant */
Set<Integer> fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
// see with HAS multiple chrom/pos/ref but different alt
if (row.size() != fileids_for_variant.size()) {
for (; ; ) {
boolean ok = true;
for (int x = 0; ok && x + 1 < row.size(); ++x) {
final VariantContext ctxx = row.get(x).getContext();
final List<Allele> altsx = ctxx.getAlternateAlleles();
for (int y = x + 1; ok && y < row.size(); ++y) {
if (row.get(x).fileIdx != row.get(y).fileIdx)
continue;
final VariantContext ctxy = row.get(y).getContext();
final List<Allele> altsy = ctxy.getAlternateAlleles();
if (altsx.equals(altsy))
continue;
if (!ctxx.isVariant() && ctxy.isVariant()) {
row.remove(x);
} else if (ctxx.isVariant() && !ctxy.isVariant()) {
row.remove(y);
} else if (!ctxx.isSNP() && ctxy.isSNP()) {
row.remove(x);
} else if (ctxx.isSNP() && !ctxy.isSNP()) {
row.remove(y);
} else if (altsx.size() > altsy.size()) {
row.remove(x);
} else if (altsx.size() < altsy.size()) {
row.remove(y);
} else {
row.remove(y);
}
ok = false;
break;
}
}
if (ok)
break;
}
fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
}
if (row.size() != fileids_for_variant.size()) {
LOG.error("There are some duplicated variants at the position " + new ContigPosRef(first) + " in the same vcf file");
for (final LineAndFile laf : row) {
LOG.error("File [" + laf.fileIdx + "]" + fileid2sampleName.get(laf.fileIdx));
LOG.error("\t" + laf.getContigPosRef());
}
row.clear();
} else {
final Set<Allele> alleles = row.stream().flatMap(R -> R.getContext().getAlleles().stream()).collect(Collectors.toSet());
final VariantContextBuilder vcb = new VariantContextBuilder(getClass().getName(), first.getContig(), first.getStart(), first.getEnd(), alleles);
final Set<String> filters = new HashSet<>();
final Set<VariantContext.Type> variantContextTypes = new HashSet<>();
final List<Genotype> genotypes = new ArrayList<Genotype>();
for (final LineAndFile laf : row) {
if (laf.getContext().isFiltered())
filters.add(variantWasFiltered.getID());
variantContextTypes.add(laf.getContext().getType());
final GenotypeBuilder gbuilder = new GenotypeBuilder();
gbuilder.name(fileid2sampleName.get(laf.fileIdx));
if (unqueSampleName.isPresent()) {
final Genotype g0 = laf.getContext().getGenotype(unqueSampleName.get());
if (g0 == null) {
iter.close();
w.close();
throw new IllegalStateException("Cannot find genotype for " + unqueSampleName.get());
}
if (g0.hasDP())
gbuilder.DP(g0.getDP());
if (g0.hasGQ())
gbuilder.GQ(g0.getGQ());
gbuilder.alleles(g0.getAlleles());
} else {
gbuilder.alleles(Arrays.asList(first.getReference(), first.getReference()));
if (laf.getContext().hasAttribute(VCFConstants.DEPTH_KEY)) {
gbuilder.DP(laf.getContext().getAttributeAsInt(VCFConstants.DEPTH_KEY, 0));
}
}
if (laf.getContext().isFiltered()) {
gbuilder.filter("VCFFILTERED");
}
if (laf.getContext().hasLog10PError()) {
gbuilder.attribute(variantQUALFormat.getID(), laf.getContext().getPhredScaledQual());
}
gbuilder.attribute(VCFConstants.ALLELE_NUMBER_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !A.isNoCall()).count());
gbuilder.attribute(VCFConstants.ALLELE_COUNT_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !(A.isReference() || A.isNoCall())).count());
gbuilder.attribute(variantTypeFormat.getID(), laf.getContext().getType().name());
genotypes.add(gbuilder.make());
}
final String id = String.join(";", row.stream().map(LAF -> LAF.getContext()).filter(V -> V.hasID()).map(V -> V.getID()).collect(Collectors.toSet()));
if (!id.isEmpty())
vcb.id(id);
vcb.genotypes(genotypes);
if (unqueSampleName.isPresent()) {
boolean all_same = true;
for (int x = 0; all_same && x + 1 < genotypes.size(); ++x) {
if (!genotypes.get(x).isCalled())
continue;
for (int y = x + 1; all_same && y < genotypes.size(); ++y) {
if (!genotypes.get(y).isCalled())
continue;
if (!genotypes.get(x).sameGenotype(genotypes.get(y), true)) {
all_same = false;
break;
}
}
}
if (!all_same)
filters.add(uniqueVariantDiscordantGTFilter.getID());
}
// Add AN
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, genotypes.stream().filter(G -> G.isCalled()).mapToInt(G -> G.getAlleles().size()).sum());
if (!variantContextTypes.isEmpty()) {
vcb.attribute(variantTypesInfo.getID(), new ArrayList<>(variantContextTypes.stream().map(T -> T.name()).collect(Collectors.toSet())));
if (variantContextTypes.size() > 1) {
filters.add(multipleTypeFilters.getID());
}
}
vcb.attribute(foundInCountVcfInfo.getID(), fileids_for_variant.size());
boolean print = true;
if (row.size() == super.inputs.size() && ignore_everywhere) {
print = false;
}
if (fileids_for_variant.size() != fileid2sampleName.size()) {
filters.add(variantNotCalledInAllVcf.getID());
if (only_everywhere) {
print = false;
}
}
vcb.filters(filters);
if (print) {
w.add(vcb.make());
}
}
row.clear();
}
if (rec == null)
break;
}
row.add(rec);
}
iter.close();
iter = null;
w.close();
w = null;
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
CloserUtil.close(w);
try {
if (variants != null)
variants.cleanup();
} catch (Exception err) {
}
}
}
use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.
the class VcfLiftOver method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
VariantContextWriter failed = null;
GenomicSequence genomicSequence = null;
try {
final VCFHeader inputHeader = in.getHeader();
final Set<VCFHeaderLine> headerLines = inputHeader.getMetaDataInInputOrder().stream().filter(V -> {
if (!(V instanceof VCFInfoHeaderLine))
return true;
final VCFInfoHeaderLine vih = VCFInfoHeaderLine.class.cast(V);
if (removeInfo.contains(vih.getID()))
return false;
return true;
}).collect(Collectors.toSet());
if (this.failedFile != null) {
final VCFHeader header2 = new VCFHeader(headerLines, inputHeader.getSampleNamesInOrder());
header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
header2.addMetaDataLine(new VCFInfoHeaderLine(this.failedinfoTag, 1, VCFHeaderLineType.String, "Why the liftOver failed."));
failed = super.openVariantContextWriter(failedFile);
failed.writeHeader(header2);
}
final VCFHeader header3 = new VCFHeader(headerLines, inputHeader.getSampleNamesInOrder());
header3.setSequenceDictionary(this.indexedFastaSequenceFile.getSequenceDictionary());
header3.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
header3.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
header3.addMetaDataLine(new VCFInfoHeaderLine(this.infoTag, 1, VCFHeaderLineType.String, "Chromosome|Position before liftOver."));
out.writeHeader(header3);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
while (in.hasNext()) {
VariantContext ctx = progress.watch(in.next());
if (!this.removeInfo.isEmpty()) {
VariantContextBuilder vcb = new VariantContextBuilder(ctx);
for (final String tag : this.removeInfo) vcb.rmAttribute(tag);
ctx = vcb.make();
}
if (ctx.isIndel() && this.ignoreIndels) {
if (failed != null)
failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "Indel").make());
continue;
}
if (adaptivematch) {
double minAlleleLength = Math.min(0, ctx.getAlleles().stream().mapToInt(A -> A.length()).min().orElse(0));
double maxAlleleLength = Math.max(1, ctx.getAlleles().stream().mapToInt(A -> A.length()).max().orElse(1));
this.liftOver.setLiftOverMinMatch(minAlleleLength / maxAlleleLength);
}
final Interval lifted = liftOver.liftOver(new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), // negative strand
false, String.join("|", ctx.getContig(), String.valueOf(ctx.getStart()), ctx.getReference().toString())));
if (lifted == null) {
if (failed != null)
failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "LiftOverFailed").make());
} else if (this.indexedFastaSequenceFile.getSequenceDictionary().getSequence(lifted.getContig()) == null) {
if (failed != null)
failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "ContigMissingDictionary|" + lifted.getContig()).make());
} else {
boolean alleleAreValidatedVsRef = true;
// part of the code was copied from picard/liftovervcf
final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<>();
final List<Allele> alleles = new ArrayList<Allele>();
for (final Allele oldAllele : ctx.getAlleles()) {
final Allele fixedAllele;
if (oldAllele.isSymbolic() || oldAllele.isNoCall() || oldAllele.equals(Allele.SPAN_DEL)) {
alleles.add(oldAllele);
continue;
} else if (lifted.isPositiveStrand()) {
fixedAllele = oldAllele;
alleles.add(oldAllele);
} else {
fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference());
alleles.add(fixedAllele);
reverseComplementAlleleMap.put(oldAllele, fixedAllele);
}
if (this.checkAlleleSequence) {
if (genomicSequence == null || !genomicSequence.getChrom().equals(lifted.getContig())) {
genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, lifted.getContig());
}
final String alleleStr = fixedAllele.getBaseString();
int x = 0;
while (x < alleleStr.length() && lifted.getStart() - 1 + x < genomicSequence.length()) {
final char refChar = genomicSequence.charAt(lifted.getStart() - 1 + x);
if (Character.toLowerCase(refChar) != Character.toLowerCase(alleleStr.charAt(x))) {
alleleAreValidatedVsRef = false;
break;
}
++x;
}
if (x != alleleStr.length()) {
alleleAreValidatedVsRef = false;
break;
}
}
}
if (!alleleAreValidatedVsRef) {
if (failed != null)
failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "AlleleMismatchRef").make());
continue;
}
if (lifted.getEnd() - lifted.getStart() != ctx.getEnd() - ctx.getStart()) {
if (failed != null)
failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "AlleleBadLength|" + lifted.length()).make());
continue;
}
final VariantContextBuilder vcb = new VariantContextBuilder(ctx.getSource(), lifted.getContig(), lifted.getStart(), lifted.getEnd(), alleles);
vcb.id(ctx.getID());
vcb.attributes(ctx.getAttributes());
vcb.attribute(this.infoTag, ctx.getContig() + "|" + ctx.getStart() + "|" + ctx.getReference().getDisplayString());
vcb.filters(ctx.getFilters());
vcb.log10PError(ctx.getLog10PError());
final GenotypesContext genotypeContext = ctx.getGenotypes();
final GenotypesContext fixedGenotypes = GenotypesContext.create(genotypeContext.size());
for (final Genotype genotype : genotypeContext) {
final List<Allele> fixedAlleles = new ArrayList<Allele>();
for (final Allele allele : genotype.getAlleles()) {
final Allele fixedAllele = reverseComplementAlleleMap.containsKey(allele) ? reverseComplementAlleleMap.get(allele) : allele;
fixedAlleles.add(fixedAllele);
}
fixedGenotypes.add(new GenotypeBuilder(genotype).alleles(fixedAlleles).make());
}
vcb.genotypes(fixedGenotypes);
out.add(vcb.make());
}
}
if (failed != null) {
failed.close();
failed = null;
}
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(failed);
}
}
use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.
the class VcfJaspar method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
final String ATT = "JASPAR";
GenomicSequence genomicSequence = null;
final VCFHeader header = new VCFHeader(in.getHeader());
addMetaData(header);
header.addMetaDataLine(new VCFInfoHeaderLine(ATT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Jaspar pattern overlapping: Format: (Name|length|Score/1000|pos|strand)"));
out.writeHeader(header);
while (in.hasNext()) {
VariantContext var = in.next();
if (genomicSequence == null || !genomicSequence.getChrom().equals(var.getContig())) {
LOG.info("Loading sequence " + var.getContig());
genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, var.getContig());
}
final Set<String> hits = new HashSet<String>();
for (final Matrix matrix : this.jasparDb) {
int start0 = Math.max(0, var.getStart() - matrix.length());
for (int y = start0; y < var.getStart() && y + matrix.length() <= genomicSequence.length(); ++y) {
CharSequence forward = new SubSequence(genomicSequence, y, y + matrix.length());
CharSequence revcomp = new RevCompCharSequence(forward);
// run each strand
for (int strand = 0; strand < 2; ++strand) {
double score = matrix.score(strand == 0 ? forward : revcomp);
if (score <= 0)
continue;
if (score >= matrix.max() * this.fraction_of_max) {
StringBuilder b = new StringBuilder("(");
b.append(matrix.getName().replaceAll("[ \t;=]+", "/"));
b.append("|");
b.append(matrix.length());
b.append("|");
b.append((int) (1000.0 * (score / matrix.max())));
b.append("|");
b.append(y + 1);
b.append("|");
b.append(strand == 0 ? '+' : '-');
b.append(")");
hits.add(b.toString());
break;
}
}
}
}
if (hits.isEmpty()) {
out.add(var);
continue;
}
final VariantContextBuilder vcb = new VariantContextBuilder(var);
vcb.attribute(ATT, hits.toArray(new String[hits.size()]));
out.add(vcb.make());
}
return RETURN_OK;
}
Aggregations