use of edu.stanford.muse.ner.Entity in project epadd by ePADD.
the class EntityBook method getDisplayNameToFreq.
/**
 * Builds a map from entity display name to that entity's frequency over all documents of the archive.
 *
 * <p>Only spans of the requested type whose type score reaches a small confidence threshold are
 * considered. Each span's text is mapped to a display name through the archive's entity book
 * (the one returned by {@code archive.getEntityBook()}, which is what this method consults rather
 * than {@code this}) and then trimmed. A display name is counted at most once per document.
 *
 * <p>NOTE(review): the first sighting creates a new {@code Entity} and every later document
 * increments its {@code freq}; the result is a per-document count only if the {@code Entity}
 * constructor initialises {@code freq} to 1 - confirm against the Entity class.
 *
 * @param archive archive supplying the documents, their entity spans and the entity book
 * @param type entity type code to keep; spans of any other type are skipped
 * @return map keyed by {@code Entity.entity} with {@code Entity.freq} as value, iterated in the
 *     order in which display names were first encountered
 */
public Map<String, Integer> getDisplayNameToFreq(Archive archive, short type) {
    // spans whose type score is below this confidence are ignored
    final double theta = 0.001;
    Map<String, Entity> displayNameToEntity = new LinkedHashMap<>();
    EntityBook entityBook = archive.getEntityBook();

    for (Document doc : archive.getAllDocs()) {
        // NOTE(review): meaning of the boolean flag is not visible here - confirm against Archive
        Span[] spans = archive.getEntitiesInDoc(doc, true);
        Set<String> seenInThisDoc = new LinkedHashSet<>();
        for (Span span : spans) {
            // bail out if not of the entity type we're looking for, or not enough confidence
            if (span.type != type || span.typeScore < theta) {
                continue;
            }

            String name = span.getText();
            String displayName = name;
            // map the name to its display name; without an entity book the raw name is used
            if (entityBook != null) {
                displayName = entityBook.getDisplayName(name, span.type);
            }
            displayName = displayName.trim();

            // Set.add returns false when the name was already present: count an entity in a doc only once
            if (!seenInThisDoc.add(displayName)) {
                continue;
            }

            Entity known = displayNameToEntity.get(displayName);
            if (known == null) {
                displayNameToEntity.put(displayName, new Entity(displayName, span.typeScore));
            } else {
                known.freq++;
            }
        }
    }

    // convert from displayNameToEntity to displayNameToFreq
    Map<String, Integer> displayNameToFreq = new LinkedHashMap<>();
    for (Entity e : displayNameToEntity.values()) {
        displayNameToFreq.put(e.entity, e.freq);
    }
    return displayNameToFreq;
}
use of edu.stanford.muse.ner.Entity in project epadd by ePADD.
the class EntityBook method fillSummaryFields.
/**
 * Rebuilds the per-entity summary map and its JSON representation from the given entity-to-documents map.
 *
 * <p>{@code summary_L1_entityCountMap} is cleared and then filled with one {@code Summary_L1} per
 * entry (score, message set, first and last date). {@code summary_JSON} is replaced with an array
 * holding one row per entry, in the iteration order of {@code docsetmap}. Each row contains:
 * <ol start="0">
 *   <li>HTML-escaped display name</li>
 *   <li>score</li>
 *   <li>number of messages</li>
 *   <li>alternate names joined by ";" (empty string when the entity has no alt-name set)</li>
 *   <li>start date formatted as MM/dd/yyyy, or null</li>
 *   <li>end date formatted as MM/dd/yyyy, or null</li>
 *   <li>display name of this book's entity type</li>
 * </ol>
 *
 * <p>Dates come from {@code EmailUtils.getFirstLast}; whenever that yields no range, or a range with
 * a missing end point, the archive's collection-level first/last date is used instead.
 *
 * @param docsetmap for each entity, its score and the set of documents it occurs in; every document
 *     is cast to {@code EmailDocument}
 * @param archive archive whose collection metadata supplies the fallback dates
 */
public void fillSummaryFields(Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap, Archive archive) {
    JSONArray resultArray = new JSONArray();
    // one formatter per call is enough; it stays local because SimpleDateFormat is not thread-safe
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");
    summary_L1_entityCountMap.clear();

    int row = 0;
    for (Map.Entry<MappedEntity, Pair<Double, Set<Document>>> entry : docsetmap.entrySet()) {
        MappedEntity mappedEntity = entry.getKey();

        Summary_L1 summary = new Summary_L1();
        summary.score = entry.getValue().first;
        summary.messages = entry.getValue().second;

        // get date range, falling back to the collection-wide dates for anything missing
        Collection<EmailDocument> emaildocs = summary.messages.stream().map(s -> (EmailDocument) s).collect(Collectors.toList());
        Pair<Date, Date> daterange = EmailUtils.getFirstLast(emaildocs, true);
        if (daterange == null) {
            daterange = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
        }
        if (daterange.first == null) {
            daterange.first = archive.collectionMetadata.firstDate;
        }
        if (daterange.second == null) {
            daterange.second = archive.collectionMetadata.lastDate;
        }
        summary.startDate = daterange.first;
        summary.endDate = daterange.second;
        summary_L1_entityCountMap.put(mappedEntity, summary);

        String entity = mappedEntity.getDisplayName();
        Set<String> altNamesSet = mappedEntity.getAltNames();
        String altNames = (altNamesSet == null) ? "" : "Alternate names: " + Util.join(altNamesSet, ";");

        JSONArray j = new JSONArray();
        j.put(0, Util.escapeHTML(entity));
        j.put(1, summary.score);
        j.put(2, summary.messages.size());
        j.put(3, altNames);
        // the collection metadata dates may themselves be null, in which case null is stored as-is
        if (summary.startDate != null) {
            j.put(4, dateFormat.format(summary.startDate));
        } else {
            j.put(4, summary.startDate);
        }
        if (summary.endDate != null) {
            j.put(5, dateFormat.format(summary.endDate));
        } else {
            j.put(5, summary.endDate);
        }
        // add entity type as well..
        j.put(6, NEType.getTypeForCode(entityType).getDisplayName());

        resultArray.put(row, j);
        row++;
    }
    summary_JSON = resultArray;
}
Aggregations