Search in sources :

Example 1 with TabbedTextFileWithHeaderParser

Use of org.broadinstitute.hellbender.utils.text.parsers.TabbedTextFileWithHeaderParser in the gatk project by broadinstitute.

The load method of the RefFlatReader class.

/**
 * Parses the refFlat file and returns an overlap detector populated with one
 * {@code Gene} per distinct gene name found in it.
 *
 * <p>Rows are grouped by gene name in first-seen order. Rows whose chromosome
 * is rejected by {@code isSequenceRecognized} are logged at debug level and
 * left out. Each group is turned into a gene via
 * {@code makeGeneFromRefFlatLines}; a group that fails to convert is logged at
 * debug level and skipped rather than aborting the load.
 *
 * @return the overlap detector holding every gene that could be built
 * @throws GeneAnnotationException if a row does not have exactly as many
 *         fields as there are {@code RefFlatColumns} constants
 */
OverlapDetector<Gene> load() {
    final OverlapDetector<Gene> detector = new OverlapDetector<>(0, 0);
    final int requiredFieldCount = RefFlatColumns.values().length;
    // NOTE(review): the parser is never closed in this method -- confirm whether
    // TabbedTextFileWithHeaderParser needs an explicit close().
    final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);

    // Insertion-ordered so genes are built in the order they first appear in the file.
    final Map<String, List<TabbedTextFileWithHeaderParser.Row>> rowsByGeneName = new LinkedHashMap<>();
    for (final TabbedTextFileWithHeaderParser.Row row : parser) {
        // getCurrentLineNumber returns the number of the next line
        // NOTE(review): that value is reported unadjusted below -- confirm it is the intended line.
        final int lineNumber = parser.getCurrentLineNumber();
        if (row.getFields().length != requiredFieldCount) {
            throw new GeneAnnotationException("Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
        }

        final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
        final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
        final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());

        if (!isSequenceRecognized(chromosome)) {
            final String transcriptDescription = geneName + ":" + transcriptName;
            LOG.debug("Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
            continue;
        }

        rowsByGeneName.computeIfAbsent(geneName, unused -> new ArrayList<>()).add(row);
    }

    // Size statistics are gathered only for the debug summary logged at the end.
    int maxGeneLength = 0;
    int genesOverOneMegabase = 0;
    for (final List<TabbedTextFileWithHeaderParser.Row> geneRows : rowsByGeneName.values()) {
        try {
            final Gene gene = makeGeneFromRefFlatLines(geneRows);
            detector.addLhs(gene, gene);

            final int geneLength = gene.length();
            maxGeneLength = Math.max(maxGeneLength, geneLength);
            if (geneLength > 1000000) {
                genesOverOneMegabase++;
            }
        } catch (Exception ex) {
            // Best effort: a gene that cannot be built is dropped, not fatal.
            LOG.debug(ex.getMessage() + " -- skipping");
        }
    }

    LOG.debug("Longest gene: " + maxGeneLength + "; number of genes > 1MB: " + genesOverOneMegabase);
    return detector;
}
Also used : TabbedTextFileWithHeaderParser(org.broadinstitute.hellbender.utils.text.parsers.TabbedTextFileWithHeaderParser) OverlapDetector(htsjdk.samtools.util.OverlapDetector)

Aggregations

OverlapDetector (htsjdk.samtools.util.OverlapDetector): 1; TabbedTextFileWithHeaderParser (org.broadinstitute.hellbender.utils.text.parsers.TabbedTextFileWithHeaderParser): 1