Use of org.apache.commons.math3.stat.descriptive.SummaryStatistics in project GDSC-SMLM by aherbert.
The class HysteresisFilter, method getSearchDistanceUsingCandidates.
/**
 * Finds the average precision of the candidates and uses it for the search
 * distance.
 *
 * @param peakResults the results providing the calibration (nm per pixel, gain, camera type)
 * @param candidates the candidate peaks whose precision is averaged
 * @return the distance threshold in pixels
 */
private double getSearchDistanceUsingCandidates(MemoryPeakResults peakResults, LinkedList<PeakResult> candidates) {
    SummaryStatistics stats = new SummaryStatistics();
    final double nmPerPixel = peakResults.getNmPerPixel();
    final double gain = peakResults.getGain();
    final boolean emCCD = peakResults.isEMCCD();
    for (PeakResult peakResult : candidates) {
        stats.addValue(peakResult.getPrecision(nmPerPixel, gain, emCCD));
    }
    // The mean precision is in nm; scale by searchDistance and convert to pixels.
    return stats.getMean() * searchDistance / nmPerPixel;
}
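For readers unfamiliar with SummaryStatistics, a minimal standalone sketch of the same pattern follows. The precision values, calibration, and searchDistance multiplier are hypothetical stand-ins, not GDSC-SMLM API:

import org.apache.commons.math3.stat.descriptive.SummaryStatistics;

public class SearchDistanceSketch {
    public static void main(String[] args) {
        SummaryStatistics stats = new SummaryStatistics();
        // Hypothetical localisation precisions in nm for a few candidates.
        for (double precisionNm : new double[] { 28.5, 31.2, 25.9, 40.1 }) {
            stats.addValue(precisionNm);
        }
        final double nmPerPixel = 100.0;   // assumed pixel calibration
        final double searchDistance = 2.5; // assumed multiplier
        // Scale the mean precision (nm) into a threshold in pixels.
        double threshold = stats.getMean() * searchDistance / nmPerPixel;
        System.out.println("search distance (pixels) = " + threshold);
    }
}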
Use of org.apache.commons.math3.stat.descriptive.SummaryStatistics in project tika by apache.
The class LuceneTokenCounter, method count.
void count(String field) throws IOException {
    long tokenCount = leafReader.getSumTotalTermFreq(field);
    if (tokenCount > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("can't handle longs");
    }
    int tokenCountInt = (int) tokenCount;
    int uniqueTokenCount = 0;
    SummaryStatistics summStats = new SummaryStatistics();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        //if there were no terms
        fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats));
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    while (bytesRef != null) {
        long termFreq = termsEnum.totalTermFreq();
        if (termFreq > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("Sorry, can't handle longs yet");
        }
        int tf = (int) termFreq;
        //TODO: figure out how to avoid Stringifying this
        //to get codepoint count
        String t = bytesRef.utf8ToString();
        int len = t.codePointCount(0, t.length());
        // Add the token's length once per occurrence so the summary
        // statistics are weighted by term frequency.
        for (int i = 0; i < tf; i++) {
            summStats.addValue(len);
        }
        // Accumulate p * log2(p) for the entropy calculation below.
        p = (double) tf / (double) tokenCount;
        ent += p * FastMath.log(base, p);
        // Keep only the topN most frequent tokens.
        if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(t, tf));
        }
        uniqueTokenCount++;
        bytesRef = termsEnum.next();
    }
    if (tokenCountInt > 0) {
        // Negate and normalise by the token count to report an entropy rate.
        ent = (-1.0d / (double) tokenCountInt) * ent;
    }
    fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats));
}
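The entropy bookkeeping above can be isolated into a small example. A minimal sketch, assuming a plain frequency map rather than a Lucene index; note that count() additionally divides by the token count, reporting an entropy rate rather than plain Shannon entropy:

import java.util.HashMap;
import java.util.Map;

public class EntropySketch {
    public static void main(String[] args) {
        // Hypothetical term frequencies standing in for a Lucene field.
        Map<String, Integer> termFreqs = new HashMap<>();
        termFreqs.put("the", 4);
        termFreqs.put("quick", 1);
        termFreqs.put("fox", 1);
        long tokenCount = 0;
        for (int tf : termFreqs.values()) {
            tokenCount += tf;
        }
        double ent = 0.0;
        for (int tf : termFreqs.values()) {
            double p = (double) tf / (double) tokenCount;
            ent += p * (Math.log(p) / Math.log(2.0)); // log base 2, as above
        }
        // Negate the accumulated sum; dividing by tokenCount as well would
        // reproduce the per-token normalisation used in count().
        System.out.println("Shannon entropy (bits/token) = " + (-ent));
    }
}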
Use of org.apache.commons.math3.stat.descriptive.SummaryStatistics in project tika by apache.
The class TokenStatistics, method equals.
@Override
public boolean equals(Object o) {
    if (this == o)
        return true;
    if (o == null || getClass() != o.getClass())
        return false;
    TokenStatistics that = (TokenStatistics) o;
    if (totalTokens != that.totalTokens)
        return false;
    if (totalUniqueTokens != that.totalUniqueTokens)
        return false;
    if (!doubleEquals(that.entropy, entropy))
        return false;
    // Arrays.equals compares elements via their equals() implementations,
    // so this relies on TokenIntPair defining equals().
    if (!Arrays.equals(topN, that.topN))
        return false;
    SummaryStatistics thatS = that.summaryStatistics;
    if (summaryStatistics.getN() != thatS.getN())
        return false;
    //if both have n==0, don't bother with the stats
    if (summaryStatistics.getN() == 0L)
        return true;
    //TODO: consider adding others...
    if (!doubleEquals(summaryStatistics.getGeometricMean(), thatS.getGeometricMean()))
        return false;
    if (!doubleEquals(summaryStatistics.getMax(), thatS.getMax()))
        return false;
    if (!doubleEquals(summaryStatistics.getMean(), thatS.getMean()))
        return false;
    if (!doubleEquals(summaryStatistics.getMin(), thatS.getMin()))
        return false;
    if (!doubleEquals(summaryStatistics.getSum(), thatS.getSum()))
        return false;
    if (!doubleEquals(summaryStatistics.getStandardDeviation(), thatS.getStandardDeviation()))
        return false;
    return true;
}
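doubleEquals is not shown in this excerpt. A plausible minimal sketch follows; the epsilon value is an assumption, not Tika's actual constant:

private static boolean doubleEquals(double a, double b) {
    // Assumed tolerance; exact bit-for-bit equality is too strict for
    // values produced by floating-point accumulation.
    final double epsilon = 1e-12;
    return Math.abs(a - b) < epsilon;
}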
Use of org.apache.commons.math3.stat.descriptive.SummaryStatistics in project tika by apache.
The class AbstractProfiler, method writeContentData.
/**
 * Writes content statistics for a file. If the metadata is null, or the
 * content is empty (null or only whitespace), this does no processing and
 * the fileId is not entered into the content table.
 *
 * @param fileId id of the file
 * @param m metadata extracted for the file
 * @param fieldName name of the field whose content is profiled
 * @param contentsTable table to write the row to
 * @throws IOException if token counting fails
 */
protected void writeContentData(String fileId, Metadata m, String fieldName, TableInfo contentsTable) throws IOException {
    if (m == null) {
        return;
    }
    Map<Cols, String> data = new HashMap<>();
    String content = getContent(m, maxContentLength, data);
    if (content == null || content.trim().length() == 0) {
        return;
    }
    tokenCounter.clear(fieldName);
    tokenCounter.add(fieldName, content);
    data.put(Cols.ID, fileId);
    data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
    langid(m, data);
    String langid = data.get(Cols.LANG_ID_1);
    langid = (langid == null) ? "" : langid;
    writeTokenCounts(data, fieldName, tokenCounter);
    CommonTokenResult commonTokenResult = null;
    try {
        commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, tokenCounter.getTokens(fieldName));
    } catch (IOException e) {
        LOG.error("{}", e.getMessage(), e);
        // Without a result, the common-token columns below would throw a
        // NullPointerException; skip the row instead.
        return;
    }
    data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
    data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
    TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
    data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens()));
    data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens()));
    data.put(Cols.NUM_ALPHABETIC_TOKENS, Integer.toString(commonTokenResult.getAlphabeticTokens()));
    data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy()));
    SummaryStatistics summStats = tokenStatistics.getSummaryStatistics();
    data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
    data.put(Cols.TOKEN_LENGTH_MEAN, Double.toString(summStats.getMean()));
    data.put(Cols.TOKEN_LENGTH_STD_DEV, Double.toString(summStats.getStandardDeviation()));
    unicodeBlocks(m, data);
    try {
        writer.writeRow(contentsTable, data);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
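The SummaryStatistics getters used above are straightforward to exercise in isolation. A minimal sketch with made-up token lengths (Cols, TableInfo, and the writer are tika-eval types and are not reproduced here):

import org.apache.commons.math3.stat.descriptive.SummaryStatistics;

public class TokenLengthStatsSketch {
    public static void main(String[] args) {
        SummaryStatistics summStats = new SummaryStatistics();
        for (int len : new int[] { 3, 5, 4, 8, 2 }) { // hypothetical token lengths
            summStats.addValue(len);
        }
        System.out.println("TOKEN_LENGTH_SUM     = " + (int) summStats.getSum());
        System.out.println("TOKEN_LENGTH_MEAN    = " + summStats.getMean());
        System.out.println("TOKEN_LENGTH_STD_DEV = " + summStats.getStandardDeviation());
    }
}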
Use of org.apache.commons.math3.stat.descriptive.SummaryStatistics in project cassandra by apache.
The class BootStrapperTest, method allocateTokensForNode.
private void allocateTokensForNode(int vn, String ks, TokenMetadata tm, InetAddress addr) {
    // Ownership spread before allocating the node's tokens.
    SummaryStatistics os = TokenAllocation.replicatedOwnershipStats(tm.cloneOnlyTokenMap(), Keyspace.open(ks).getReplicationStrategy(), addr);
    Collection<Token> tokens = BootStrapper.allocateTokens(tm, addr, ks, vn, 0);
    assertEquals(vn, tokens.size());
    tm.updateNormalTokens(tokens, addr);
    // Ownership spread after the allocation, for comparison against the baseline.
    SummaryStatistics ns = TokenAllocation.replicatedOwnershipStats(tm.cloneOnlyTokenMap(), Keyspace.open(ks).getReplicationStrategy(), addr);
    verifyImprovement(os, ns);
}
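verifyImprovement is not shown in this excerpt. A hedged sketch of what it might assert, assuming the goal is that allocation narrows the spread of replicated ownership (the real Cassandra test may compare differently):

private static void verifyImprovement(SummaryStatistics os, SummaryStatistics ns) {
    // A lower standard deviation of ownership across the ring means the
    // new tokens made the distribution more even, not less.
    if (ns.getStandardDeviation() > os.getStandardDeviation()) {
        throw new AssertionError("ownership spread worsened: "
                + os.getStandardDeviation() + " -> " + ns.getStandardDeviation());
    }
}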