Search in sources:

Example 26 with MutableInt

use of org.apache.commons.lang3.mutable.MutableInt in project tika by apache.

The method _add of the class TokenCounter.

/**
 * Tokenizes {@code content} with {@code analyzer}, adds the resulting token counts to the
 * count map kept for {@code field}, and stores freshly computed {@link TokenStatistics}
 * for that field in {@code tokenStatistics}.
 *
 * <p>The token stream is consumed following the Lucene workflow
 * ({@code reset()}, {@code incrementToken()}..., {@code end()}, {@code close()}); the
 * stream is closed even when tokenization fails part-way.
 *
 * @param field    name of the field; used both as the analyzer field and as the key into
 *                 the per-field maps
 * @param analyzer analyzer that produces the token stream
 * @param content  text to tokenize
 * @throws IOException if the token stream fails while being read
 */
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;
    Map<String, MutableInt> tokenMap;
    // try-with-resources guarantees close() runs after end(), and also on failure.
    try (TokenStream ts = analyzer.tokenStream(field, content)) {
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        tokenMap = map.get(field);
        if (tokenMap == null) {
            tokenMap = new HashMap<>();
            map.put(field, tokenMap);
        }
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            MutableInt cnt = tokenMap.get(token);
            if (cnt == null) {
                cnt = new MutableInt(1);
                tokenMap.put(token, cnt);
            } else {
                cnt.increment();
            }
            totalTokens++;
        }
        // end() must be called once the stream is exhausted and before it is closed.
        ts.end();
    }
    int totalUniqueTokens = tokenMap.size();
    double ent = 0.0d;
    final double base = 2.0;
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    // NOTE(review): totalTokens counts only the tokens seen in this call, whereas tokenMap
    // holds the counts accumulated for the field so far -- confirm this is intended when
    // _add is invoked more than once for the same field.
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        double p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        // Token length is measured in Unicode code points, not UTF-16 chars.
        int len = token.codePointCount(0, token.length());
        // Record the length once per occurrence so the statistics are frequency-weighted.
        for (int i = 0; i < termFreq; i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    // Scale the accumulated sum of p*log2(p) by -1/totalTokens; skipped when no tokens were
    // read to avoid dividing by zero.
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }
    tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) SummaryStatistics(org.apache.commons.math3.stat.descriptive.SummaryStatistics) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) MutableInt(org.apache.commons.lang3.mutable.MutableInt) Map(java.util.Map) HashMap(java.util.HashMap)

Example 27 with MutableInt

use of org.apache.commons.lang3.mutable.MutableInt in project tika by apache.

The method calculateContrastStatistics of the class TokenContraster.

/**
 * Compares two token-count maps and packages the outcome as a {@link ContrastStatistics}.
 *
 * <p>Every token of {@code mapA} is passed to {@code add} together with its count in
 * {@code mapB} (0 when absent); tokens that occur only in {@code mapB} are then passed
 * with a count of 0 for A. After {@code finishComputing()} the result object is filled
 * from this instance's dice coefficient, overlap and top-N queues.
 *
 * @param mapA             token counts of the first input
 * @param tokenStatisticsA statistics of the first input, stored on this instance
 * @param mapB             token counts of the second input
 * @param tokenStatisticsB statistics of the second input, stored on this instance
 * @return the populated contrast statistics
 */
public ContrastStatistics calculateContrastStatistics(Map<String, MutableInt> mapA, TokenStatistics tokenStatisticsA, Map<String, MutableInt> mapB, TokenStatistics tokenStatisticsB) {
    reset();
    this.tokenStatisticsA = tokenStatisticsA;
    this.tokenStatisticsB = tokenStatisticsB;

    // Pass 1: all tokens known to A, paired with their count in B (if any).
    for (Map.Entry<String, MutableInt> entryA : mapA.entrySet()) {
        String token = entryA.getKey();
        MutableInt countInB = mapB.get(token);
        int b;
        if (countInB == null) {
            b = 0;
        } else {
            b = countInB.intValue();
        }
        add(token, entryA.getValue().intValue(), b);
    }

    // Pass 2: tokens that appear in B only; those shared with A were handled above.
    for (Map.Entry<String, MutableInt> entryB : mapB.entrySet()) {
        String token = entryB.getKey();
        if (!mapA.containsKey(token)) {
            add(token, 0, entryB.getValue().intValue());
        }
    }

    finishComputing();

    ContrastStatistics result = new ContrastStatistics();
    result.setDiceCoefficient(diceCoefficient);
    result.setOverlap(overlap);
    result.setTopNUniqueA(uniqA.getArray());
    result.setTopNUniqueB(uniqB.getArray());
    result.setTopNMoreA(moreA.getArray());
    result.setTopNMoreB(moreB.getArray());
    return result;
}
Also used : MutableInt(org.apache.commons.lang3.mutable.MutableInt) Map(java.util.Map)

Example 28 with MutableInt

use of org.apache.commons.lang3.mutable.MutableInt in project gatk-protected by broadinstitute.

The method create of the class AllelicPanelOfNormalsCreator.

/**
 * Builds an {@link AllelicPanelOfNormals} from the configured pulldown files.
 * For every site the ref and alt read counts are summed over all files, and the number
 * of files containing the site is tracked; a site is kept only if that number divided by
 * the total number of files is greater than or equal to the threshold.
 *
 * @param siteFrequencyThreshold    minimum fraction of samples in which a site must appear
 * @return                          an {@link AllelicPanelOfNormals} holding the summed counts
 *                                  of the sites that pass the threshold
 */
public AllelicPanelOfNormals create(final double siteFrequencyThreshold) {
    logger.info("Creating allelic panel of normals...");

    // per-site number of samples containing the site (drives the frequency filter)
    final Map<SimpleInterval, MutableInt> sampleCountBySite = new HashMap<>();
    // per-site running totals only, rather than per-sample counts, to save memory
    final Map<SimpleInterval, AllelicCount> summedCountBySite = new HashMap<>();

    final int totalNumberOfSamples = pulldownFiles.size();
    int fileIndex = 1;
    for (final File pulldownFile : pulldownFiles) {
        logger.info("Processing pulldown file " + fileIndex + "/" + totalNumberOfSamples + " (" + pulldownFile + ")...");
        fileIndex++;

        final AllelicCountCollection pulldownCounts = new AllelicCountCollection(pulldownFile);
        for (final AllelicCount count : pulldownCounts.getCounts()) {
            final SimpleInterval site = count.getInterval();

            // fold this sample's ref/alt counts into the running total for the site
            final AllelicCount emptyCount = new AllelicCount(site, 0, 0);
            final AllelicCount runningTotal = summedCountBySite.getOrDefault(site, emptyCount);
            final int summedRef = runningTotal.getRefReadCount() + count.getRefReadCount();
            final int summedAlt = runningTotal.getAltReadCount() + count.getAltReadCount();
            summedCountBySite.put(site, new AllelicCount(site, summedRef, summedAlt));

            // bump the number of samples in which the site has been seen
            final MutableInt samplesSeen = sampleCountBySite.get(site);
            if (samplesSeen != null) {
                samplesSeen.increment();
            } else {
                sampleCountBySite.put(site, new MutableInt(1));
            }
        }
    }
    logger.info("Total number of unique sites present in samples: " + summedCountBySite.size());

    // keep only the sites whose sample frequency reaches the threshold
    final AllelicCountCollection totalCounts = new AllelicCountCollection();
    for (final Map.Entry<SimpleInterval, MutableInt> entry : sampleCountBySite.entrySet()) {
        final double siteFrequency = entry.getValue().doubleValue() / totalNumberOfSamples;
        if (siteFrequency >= siteFrequencyThreshold) {
            totalCounts.add(summedCountBySite.get(entry.getKey()));
        }
    }
    logger.info(String.format("Number of unique sites present in samples above site frequency = %4.3f: %d", siteFrequencyThreshold, totalCounts.getCounts().size()));
    return new AllelicPanelOfNormals(totalCounts);
}
Also used : MutableInt(org.apache.commons.lang3.mutable.MutableInt) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount) HashMap(java.util.HashMap) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) ArrayList(java.util.ArrayList) List(java.util.List) Logger(org.apache.logging.log4j.Logger) Map(java.util.Map) Utils(org.broadinstitute.hellbender.utils.Utils) LogManager(org.apache.logging.log4j.LogManager) AllelicCountCollection(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCountCollection) HashMap(java.util.HashMap) MutableInt(org.apache.commons.lang3.mutable.MutableInt) AllelicCountCollection(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCountCollection) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) File(java.io.File) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount)

Aggregations

MutableInt (org.apache.commons.lang3.mutable.MutableInt)28 Test (org.junit.Test)10 HashMap (java.util.HashMap)5 MutableBoolean (org.apache.commons.lang3.mutable.MutableBoolean)5 Map (java.util.Map)4 TimeoutException (java.util.concurrent.TimeoutException)4 File (java.io.File)3 ArrayList (java.util.ArrayList)3 List (java.util.List)2 ILogicalOperator (org.apache.hyracks.algebricks.core.algebra.base.ILogicalOperator)2 LogManager (org.apache.logging.log4j.LogManager)2 Logger (org.apache.logging.log4j.Logger)2 AllelicCount (org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount)2 AllelicCountCollection (org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCountCollection)2 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)2 Utils (org.broadinstitute.hellbender.utils.Utils)2 IOUtils (org.broadinstitute.hellbender.utils.io.IOUtils)2 ParamUtils (org.broadinstitute.hellbender.utils.param.ParamUtils)2 GraphStoreFixture (org.neo4j.consistency.checking.GraphStoreFixture)2 IdGenerator (org.neo4j.consistency.checking.GraphStoreFixture.IdGenerator)2