Use of edu.stanford.muse.index.Document in project epadd by ePADD.
In class EntityBook, the method getDisplayNameToFreq:
public Map<String, Integer> getDisplayNameToFreq(Archive archive, short type) {
    Map<String, Entity> displayNameToEntity = new LinkedHashMap<>();
    double theta = 0.001;
    EntityBook entityBook = archive.getEntityBook();
    for (Document doc : archive.getAllDocs()) {
        Span[] spans = archive.getEntitiesInDoc(doc, true);
        Set<String> seenInThisDoc = new LinkedHashSet<>();
        for (Span span : spans) {
            // bail out if not of the entity type we're looking for, or not enough confidence
            if (span.type != type || span.typeScore < theta)
                continue;
            String name = span.getText();
            String displayName = name;
            // map the name to its display name; if there is no mapping, we get the same name back as its displayName
            if (entityBook != null)
                displayName = entityBook.getDisplayName(name, span.type);
            displayName = displayName.trim();
            // count an entity in a doc only once
            if (seenInThisDoc.contains(displayName))
                continue;
            seenInThisDoc.add(displayName);
            if (!displayNameToEntity.containsKey(displayName))
                // first occurrence: Entity's constructor is expected to initialize freq to 1
                displayNameToEntity.put(displayName, new Entity(displayName, span.typeScore));
            else
                displayNameToEntity.get(displayName).freq++;
        }
    }
    // convert from displayNameToEntity to displayNameToFreq
    Map<String, Integer> displayNameToFreq = new LinkedHashMap<>();
    for (Entity e : displayNameToEntity.values())
        displayNameToFreq.put(e.entity, e.freq);
    return displayNameToFreq;
}
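A typical consumer sorts the returned map by frequency to surface the most common entities of a given type. A minimal sketch, assuming a loaded Archive and its EntityBook instance; NEType appears in the snippets below, but the PERSON constant and getCode() used here are assumptions:

    // Hypothetical usage: top 20 person entities by document frequency.
    EntityBook entityBook = archive.getEntityBook(); // getter shown in the method above
    Map<String, Integer> freqs = entityBook.getDisplayNameToFreq(archive, NEType.Type.PERSON.getCode());
    freqs.entrySet().stream()
            .sorted(Map.Entry.<String, Integer>comparingByValue().reversed())
            .limit(20)
            .forEach(e -> System.out.println(e.getKey() + " -> " + e.getValue()));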
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
In class EntityBookManager, the method fillEntityBookFromLucene:
/*
 * This is a slow path, but the assumption is that it is used only once, when porting old archives
 * (where entity books were not yet factored out as separate files). After that, only the other path,
 * 'fillEntityBookFromText', is used repeatedly (when loading the archive).
 */
private void fillEntityBookFromLucene(Short type) {
    EntityBook ebook = new EntityBook(type);
    mTypeToEntityBook.put(type, ebook);
    double theta = 0.001;
    // docsetmap maps a MappedEntity to its score and the set of documents it appears in.
    Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
    for (Document doc : mArchive.getAllDocs()) {
        Span[] spansbody = getEntitiesInDocFromLucene(doc, true);
        Span[] spans = getEntitiesInDocFromLucene(doc, false);
        Span[] allspans = ArrayUtils.addAll(spans, spansbody);
        Set<String> seenInThisDoc = new LinkedHashSet<>(); // note: unused in this path; the docset below already dedupes per document
        for (Span span : allspans) {
            // bail out if not of the entity type we're looking for, or not enough confidence
            if (span.type != type || span.typeScore < theta)
                continue;
            String name = span.getText();
            String canonicalizedname = EntityBook.canonicalize(name);
            Double score = span.typeScore; // autoboxing avoids the deprecated Double constructor
            // map the name to its display name; if there is no mapping, we get the same name back as its displayName
            MappedEntity mappedEntity = ebook.nameToMappedEntity.get(canonicalizedname);
            if (mappedEntity == null) {
                // add this name as a mapped entity in the entity book.
                mappedEntity = new MappedEntity();
                // Don't canonicalize the display name, otherwise 'University of Florida' becomes 'florida of university'
                mappedEntity.setDisplayName(name);
                mappedEntity.setEntityType(type);
                mappedEntity.addAltNames(name);
                ebook.nameToMappedEntity.put(canonicalizedname, mappedEntity);
                // no doc exists yet for this mappedEntity
                Set<Document> docset = new LinkedHashSet<>();
                docset.add(doc);
                docsetmap.put(mappedEntity, new Pair<>(score, docset));
            } else {
                // already seen: add the doc to its docset, and take the score as the max of all scores seen so far
                Double oldscore = docsetmap.get(mappedEntity).first;
                Double finalscore = Double.max(oldscore, score);
                Set<Document> docset = docsetmap.get(mappedEntity).second;
                docset.add(doc);
                docsetmap.put(mappedEntity, new Pair<>(finalscore, docset));
            }
        }
    }
    // fill the cached summary for ebook in its other fields.
    ebook.fillSummaryFields(docsetmap, mArchive);
}
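The else branch above is the "union the doc sets, keep the max score" pattern; the docsetmap bookkeeping (though not the MappedEntity registration in the null branch, which still has to happen) can be expressed as a single Map.merge call. A behavior-equivalent sketch, assuming Pair exposes its first/second fields as the code above does:

    // Inside the loop: merge the new (score, {doc}) pair into docsetmap.
    Set<Document> singleton = new LinkedHashSet<>();
    singleton.add(doc);
    docsetmap.merge(mappedEntity, new Pair<>(score, singleton), (oldVal, newVal) -> {
        oldVal.second.addAll(newVal.second); // union the document sets
        return new Pair<>(Double.max(oldVal.first, newVal.first), // keep the max score
                oldVal.second);
    });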
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
In class EntityBook, the method fillSummaryFields:
public void fillSummaryFields(Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap, Archive archive) {
    JSONArray resultArray = new JSONArray();
    // single-element array: lets the lambda below mutate the counter despite the effectively-final rule
    final Integer[] count = { 0 };
    summary_L1_entityCountMap.clear();
    docsetmap.entrySet().forEach(entry -> {
        count[0] = count[0] + 1;
        Summary_L1 summary = new Summary_L1();
        summary.score = entry.getValue().first;
        summary.messages = entry.getValue().second;
        // get the date range; fall back to the collection-wide range if it is missing or partial
        Collection<EmailDocument> emaildocs = summary.messages.stream().map(s -> (EmailDocument) s).collect(Collectors.toList());
        Pair<Date, Date> daterange = EmailUtils.getFirstLast(emaildocs, true);
        if (daterange == null)
            daterange = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
        if (daterange.first == null)
            daterange.first = archive.collectionMetadata.firstDate;
        if (daterange.second == null)
            daterange.second = archive.collectionMetadata.lastDate;
        summary.startDate = daterange.first;
        summary.endDate = daterange.second;
        summary_L1_entityCountMap.put(entry.getKey(), summary);
        String entity = entry.getKey().getDisplayName();
        JSONArray j = new JSONArray();
        Set<String> altNamesSet = entry.getKey().getAltNames();
        String altNames = (altNamesSet == null) ? "" : "Alternate names: " + Util.join(altNamesSet, ";");
        SimpleDateFormat df = new SimpleDateFormat("MM/dd/yyyy");
        j.put(0, Util.escapeHTML(entity));
        j.put(1, summary.score);
        j.put(2, summary.messages.size());
        j.put(3, altNames);
        j.put(4, summary.startDate != null ? df.format(summary.startDate) : summary.startDate);
        j.put(5, summary.endDate != null ? df.format(summary.endDate) : summary.endDate);
        // add the entity type as well
        j.put(6, NEType.getTypeForCode(entityType).getDisplayName());
        resultArray.put(count[0] - 1, j);
    });
    summary_JSON = resultArray;
}
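Each element of summary_JSON is therefore a 7-slot row: [escaped display name, score, message count, alternate names, start date, end date, type name]. A hypothetical row, purely for illustration (all values invented):

    ["John Doe", 0.87, 14, "Alternate names: J. Doe;Johnny Doe", "01/05/2001", "11/23/2004", "Person"]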
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
In class MuseEmailFetcher, the method updateStats:
/**
* this should probably move to archive.java
*/
private void updateStats(Archive archive, AddressBook addressBook, FetchStats stats) {
    Collection<EmailDocument> allEmailDocs = (Collection) archive.getAllDocs();
    // the rest of this is basically stats collection
    int nSent = 0, nReceived = 0;
    for (EmailDocument ed : allEmailDocs) {
        Pair<Boolean, Boolean> p = addressBook.isSentOrReceived(ed.getToCCBCC(), ed.from);
        boolean sent = p.getFirst();
        boolean received = p.getSecond();
        if (sent)
            nSent++;
        if (received)
            nReceived++;
    }
    stats.dataErrors = getDataErrors();
    stats.nMessagesInArchive = allEmailDocs.size();
    /* compute stats for the time range */
    if (allEmailDocs.size() > 0) {
        Pair<Date, Date> p = EmailUtils.getFirstLast(allEmailDocs);
        stats.firstMessageDate = p.getFirst() == null ? 0 : p.getFirst().getTime();
        stats.lastMessageDate = p.getSecond() == null ? 0 : p.getSecond().getTime();
    }
    // add stats for the duplicate messages recorded in the archive's dupMessageInfo field,
    // which MuseEmailFetcher fills while fetching messages; the duplicate reports below are
    // formatted from that map
    long sizeSavedFromDupMessages = 0;
    long sizeSavedFromDupAttachments = 0;
    Collection<String> dupMessages = new LinkedHashSet<>();
    for (Document doc : archive.getDupMessageInfo().keySet()) {
        EmailDocument edoc = (EmailDocument) doc;
        StringBuilder sb = new StringBuilder();
        long sizesaved = 0;
        long totalsize = 0;
        // number of duplicates found for this email document
        int numofduplicates = archive.getDupMessageInfo().get(doc).size();
        sb.append("Duplicate message: The following messages were found to be duplicates of\n message id #" + edoc.getUniqueId() + " (" + edoc.folderName + "):\n");
        // total size of this message's attachments
        for (Blob b : edoc.attachments)
            totalsize += b.size;
        sizesaved = numofduplicates * totalsize;
        int count = 1;
        for (Tuple2 s : archive.getDupMessageInfo().get(doc)) {
            sb.append("  " + count + ". Message id #" + s.getSecond() + " (" + s.getFirst() + ")\n");
            count++;
        }
        if (sizesaved != 0) {
            sb.append("***** Saved " + sizesaved + " bytes by detecting these duplicates\n");
            sizeSavedFromDupMessages += sizesaved;
        }
        dupMessages.add(sb.toString());
    }
    stats.dataErrors.addAll(dupMessages);
    // also add stats for the blob store
    Collection<String> dupBlobMessages = new LinkedHashSet<>();
    Map<Blob, Integer> dupblobs = archive.getBlobStore().getDupBlobCount();
    if (dupblobs.size() > 0) {
        for (Blob b : dupblobs.keySet()) {
            dupBlobMessages.add("Duplicate attachments: " + dupblobs.get(b) + " duplicate attachments found of " + archive.getBlobStore().full_filename_normalized(b) + ". Total space saved by not storing these duplicates is " + dupblobs.get(b) * b.size + " bytes\n");
            sizeSavedFromDupAttachments += dupblobs.get(b) * b.size;
        }
    }
    stats.dataErrors.addAll(dupBlobMessages);
    // integer division: savings are reported in KB
    stats.spaceSavingFromDupMessageDetection = sizeSavedFromDupMessages / 1000;
    stats.spaceSavingFromDupAttachmentDetection = sizeSavedFromDupAttachments / 1000;
    // stats.dataErrors.add("Space saving from duplicate detection:" + sizeSavedFromDupMessages/1000 + "KB saved by detecting duplicate messages\n");
    // stats.dataErrors.add("Space saving from duplicate detection:" + sizeSavedFromDupAttachments/1000 + "KB saved by detecting duplicate attachments\n");
    archive.addStats(stats);
    log.info("Fetcher stats: " + stats);
}
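The sent/received loop at the top of the method is a simple fold; the same tally can be written as a mutable reduction over the stream of Pair results. A behavior-equivalent sketch, using only the AddressBook call shown above:

    // Equivalent sent/received tally; calls isSentOrReceived once per message.
    long[] tally = allEmailDocs.stream()
            .map(ed -> addressBook.isSentOrReceived(ed.getToCCBCC(), ed.from))
            .collect(() -> new long[2],
                    (t, p) -> { if (p.getFirst()) t[0]++; if (p.getSecond()) t[1]++; },
                    (a, b) -> { a[0] += b[0]; a[1] += b[1]; });
    long nSent = tally[0], nReceived = tally[1];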
Use of edu.stanford.muse.index.Document in project epadd by ePADD.
In class ProperNounLinker, the method main:
public static void main(String[] args) {
    // BOWtest();
    // test();
    Random rand = new Random();
    try {
        String userDir = System.getProperty("user.home") + File.separator + "epadd-appraisal" + File.separator + "user";
        Archive archive = SimpleSessions.readArchiveIfPresent(userDir);
        // findMerges(archive);
        // SimpleSessions.saveArchive(archive.baseDir, "default", archive);
        List<Document> docs = archive.getAllDocs();
        long st = System.currentTimeMillis();
        int numQ = 0;
        for (int i = 0; i < 5; i++) {
            Document doc = docs.get(rand.nextInt(docs.size()));
            Span[] es = NER.getNames(doc, true, archive);
            // query nearest matches for every single-token mention in the doc
            Arrays.stream(es).filter(s -> !s.text.contains(" "))
                    .forEach(s -> System.out.println(s.text + "<->" + getNearestMatches(new EmailMention(s, doc, new EmailHierarchy()), 5, archive)));
            numQ += Arrays.stream(es).filter(s -> !s.text.contains(" ")).count();
        }
        long elapsed = System.currentTimeMillis() - st;
        System.out.println("NumQ: " + numQ + " - Time: " + elapsed + "ms - AVG: " + ((float) elapsed / numQ) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
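The timing above amortizes the cost over all mentions in five random documents. To measure per-query latency instead, each getNearestMatches call can be timed individually; a sketch using only the calls shown above:

    // Per-query timing sketch: measures each nearest-match lookup separately.
    long totalNanos = 0;
    int n = 0;
    for (Span s : es) {
        if (s.text.contains(" "))
            continue; // single-token mentions only, as in the loop above
        long t0 = System.nanoTime();
        getNearestMatches(new EmailMention(s, doc, new EmailHierarchy()), 5, archive);
        totalNanos += System.nanoTime() - t0;
        n++;
    }
    System.out.println("avg per query: " + (totalNanos / Math.max(1, n)) / 1e6 + " ms");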