use of org.apache.commons.lang3.mutable.MutableInt in project tika by apache.
the class TokenCounter method _add.
/**
 * Tokenizes {@code content} with {@code analyzer}, updates the per-field token
 * counts held in {@code map}, and stores a fresh {@link TokenStatistics} for
 * {@code field} in {@code tokenStatistics}.
 *
 * <p>The token stream is consumed in the order reset, incrementToken, end, close;
 * the stream is closed even if tokenization throws.
 *
 * @param field    field name used both for the analyzer and as the key into the count maps
 * @param analyzer analyzer that produces the token stream
 * @param content  text to tokenize
 * @throws IOException if the token stream fails while being reset, read, ended or closed
 */
private void _add(String field, Analyzer analyzer, String content) throws IOException {
    int totalTokens = 0;
    Map<String, MutableInt> tokenMap;
    try (TokenStream ts = analyzer.tokenStream(field, content)) {
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        // Counts for a field accumulate in the shared map across calls.
        tokenMap = map.get(field);
        if (tokenMap == null) {
            tokenMap = new HashMap<>();
            map.put(field, tokenMap);
        }
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            MutableInt cnt = tokenMap.get(token);
            if (cnt == null) {
                cnt = new MutableInt(1);
                tokenMap.put(token, cnt);
            } else {
                cnt.increment();
            }
            totalTokens++;
        }
        // end() must run after the last incrementToken() and before close().
        ts.end();
    }

    int totalUniqueTokens = tokenMap.size();
    double ent = 0.0d;
    double base = 2.0;
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    SummaryStatistics summaryStatistics = new SummaryStatistics();
    // NOTE(review): tokenMap may hold counts from earlier calls for the same field while
    // totalTokens only counts this call -- confirm callers invoke this once per field.
    for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) {
        String token = e.getKey();
        int termFreq = e.getValue().intValue();
        double p = (double) termFreq / (double) totalTokens;
        ent += p * FastMath.log(base, p);
        // Token length in code points, recorded once per occurrence of the token.
        int len = token.codePointCount(0, token.length());
        for (int i = 0; i < termFreq; i++) {
            summaryStatistics.addValue(len);
        }
        if (queue.top() == null || queue.size() < topN || termFreq >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(token, termFreq));
        }
    }
    if (totalTokens > 0) {
        ent = (-1.0d / (double) totalTokens) * ent;
    }
    tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens, queue.getArray(), ent, summaryStatistics));
}
use of org.apache.commons.lang3.mutable.MutableInt in project tika by apache.
the class TokenContraster method calculateContrastStatistics.
/**
 * Compares the token counts of two inputs and packages the result.
 *
 * <p>Every token present in either map is passed to {@code add} exactly once with its
 * count in A and its count in B (0 when the token is absent from that side). After
 * {@code finishComputing()} the computed fields are copied into a new
 * {@link ContrastStatistics}.
 *
 * @param mapA             token counts for input A
 * @param tokenStatisticsA statistics for input A, stored on this instance
 * @param mapB             token counts for input B
 * @param tokenStatisticsB statistics for input B, stored on this instance
 * @return a newly created {@link ContrastStatistics} holding the dice coefficient, the
 *         overlap and the four top-N arrays
 */
public ContrastStatistics calculateContrastStatistics(Map<String, MutableInt> mapA, TokenStatistics tokenStatisticsA, Map<String, MutableInt> mapB, TokenStatistics tokenStatisticsB) {
    reset();
    this.tokenStatisticsA = tokenStatisticsA;
    this.tokenStatisticsB = tokenStatisticsB;

    // First pass: every token of A, paired with its count in B (0 if B lacks it).
    for (Map.Entry<String, MutableInt> entryA : mapA.entrySet()) {
        final String token = entryA.getKey();
        final MutableInt countInB = mapB.get(token);
        add(token, entryA.getValue().intValue(), countInB != null ? countInB.intValue() : 0);
    }

    // Second pass: tokens that occur only in B; shared tokens were handled above.
    for (Map.Entry<String, MutableInt> entryB : mapB.entrySet()) {
        final String token = entryB.getKey();
        if (!mapA.containsKey(token)) {
            add(token, 0, entryB.getValue().intValue());
        }
    }

    finishComputing();

    final ContrastStatistics result = new ContrastStatistics();
    result.setDiceCoefficient(diceCoefficient);
    result.setOverlap(overlap);
    result.setTopNUniqueA(uniqA.getArray());
    result.setTopNUniqueB(uniqB.getArray());
    result.setTopNMoreA(moreA.getArray());
    result.setTopNMoreB(moreB.getArray());
    return result;
}
use of org.apache.commons.lang3.mutable.MutableInt in project gatk-protected by broadinstitute.
the class AllelicPanelOfNormalsCreator method create.
/**
 * Builds an {@link AllelicPanelOfNormals} from the pulldown files, applying a
 * site-frequency cutoff: a site is kept only if the fraction of samples in which it
 * appears is greater than or equal to the threshold. For each kept site the panel
 * receives the ref and alt read counts summed over all samples.
 *
 * @param siteFrequencyThreshold minimum fraction of samples a site must appear in
 * @return an {@link AllelicPanelOfNormals} built from the sites that pass the threshold
 */
public AllelicPanelOfNormals create(final double siteFrequencyThreshold) {
    logger.info("Creating allelic panel of normals...");
    final int totalNumberOfSamples = pulldownFiles.size();
    // Per-site summed counts; only totals are retained to keep memory use down.
    final Map<SimpleInterval, AllelicCount> summedCountsBySite = new HashMap<>();
    // Per-site tally of how many samples contained the site, used for the frequency filter.
    final Map<SimpleInterval, MutableInt> sampleTallyBySite = new HashMap<>();

    int fileIndex = 0;
    for (final File pulldownFile : pulldownFiles) {
        fileIndex++;
        logger.info("Processing pulldown file " + fileIndex + "/" + totalNumberOfSamples + " (" + pulldownFile + ")...");
        final AllelicCountCollection sampleCounts = new AllelicCountCollection(pulldownFile);
        for (final AllelicCount sampleCount : sampleCounts.getCounts()) {
            final SimpleInterval site = sampleCount.getInterval();

            // Add this sample's ref/alt counts to the running total for the site.
            final AllelicCount runningTotal = summedCountsBySite.getOrDefault(site, new AllelicCount(site, 0, 0));
            summedCountsBySite.put(site, new AllelicCount(
                    site,
                    runningTotal.getRefReadCount() + sampleCount.getRefReadCount(),
                    runningTotal.getAltReadCount() + sampleCount.getAltReadCount()));

            // One more sample has been seen with this site.
            sampleTallyBySite.computeIfAbsent(site, s -> new MutableInt(0)).increment();
        }
    }
    logger.info("Total number of unique sites present in samples: " + summedCountsBySite.size());

    // Keep only sites whose sample frequency reaches the threshold.
    final AllelicCountCollection totalCounts = new AllelicCountCollection();
    for (final Map.Entry<SimpleInterval, MutableInt> tally : sampleTallyBySite.entrySet()) {
        final double siteFrequency = tally.getValue().doubleValue() / totalNumberOfSamples;
        if (siteFrequency >= siteFrequencyThreshold) {
            totalCounts.add(summedCountsBySite.get(tally.getKey()));
        }
    }
    logger.info(String.format("Number of unique sites present in samples above site frequency = %4.3f: %d", siteFrequencyThreshold, totalCounts.getCounts().size()));
    return new AllelicPanelOfNormals(totalCounts);
}
Aggregations