use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class EntityBook method fillSummaryFields.
/**
 * Rebuilds the per-entity summary map ({@code summary_L1_entityCountMap}) and its JSON
 * table ({@code summary_JSON}) from the given entity-to-documents map.
 *
 * <p>Each entity becomes one JSON row with the following columns:
 * <ol start="0">
 *   <li>HTML-escaped display name</li>
 *   <li>score</li>
 *   <li>number of messages</li>
 *   <li>alternate names ("Alternate names: a;b;c", or the empty string when there are none)</li>
 *   <li>start date formatted as MM/dd/yyyy (null when unknown)</li>
 *   <li>end date formatted as MM/dd/yyyy (null when unknown)</li>
 *   <li>display name of this book's entity type</li>
 * </ol>
 *
 * @param docsetmap entity mapped to a pair of (score, documents mentioning the entity);
 *                  every document is cast to {@link EmailDocument}
 * @param archive   supplies the collection-wide first/last dates used when an entity's
 *                  own date range cannot be determined
 */
public void fillSummaryFields(Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap, Archive archive) {
    JSONArray resultArray = new JSONArray();
    // One formatter per call: SimpleDateFormat is not thread-safe, so it stays local,
    // but there is no need to build a fresh one for every date.
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");
    summary_L1_entityCountMap.clear();

    int row = 0;
    for (Map.Entry<MappedEntity, Pair<Double, Set<Document>>> entry : docsetmap.entrySet()) {
        MappedEntity mappedEntity = entry.getKey();

        Summary_L1 summary = new Summary_L1();
        summary.score = entry.getValue().first;
        summary.messages = entry.getValue().second;

        // Date range of the entity's messages; any missing end falls back to the
        // collection-level metadata.
        Collection<EmailDocument> emaildocs = summary.messages.stream()
                .map(s -> (EmailDocument) s)
                .collect(Collectors.toList());
        Pair<Date, Date> daterange = EmailUtils.getFirstLast(emaildocs, true);
        if (daterange == null) {
            daterange = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
        }
        if (daterange.first == null) {
            daterange.first = archive.collectionMetadata.firstDate;
        }
        if (daterange.second == null) {
            daterange.second = archive.collectionMetadata.lastDate;
        }
        summary.startDate = daterange.first;
        summary.endDate = daterange.second;
        summary_L1_entityCountMap.put(mappedEntity, summary);

        Set<String> altNamesSet = mappedEntity.getAltNames();
        // NOTE(review): unlike the display name, the alternate names are emitted without
        // HTML escaping -- confirm that the consumer of column 3 escapes them.
        String altNames = (altNamesSet == null) ? "" : "Alternate names: " + Util.join(altNamesSet, ";");
        String startText = (summary.startDate == null) ? null : dateFormat.format(summary.startDate);
        String endText = (summary.endDate == null) ? null : dateFormat.format(summary.endDate);

        JSONArray j = new JSONArray();
        j.put(0, Util.escapeHTML(mappedEntity.getDisplayName()));
        j.put(1, summary.score);
        j.put(2, summary.messages.size());
        j.put(3, altNames);
        j.put(4, startText);
        j.put(5, endText);
        // add entity type as well..
        j.put(6, NEType.getTypeForCode(entityType).getDisplayName());
        resultArray.put(row++, j);
    }
    summary_JSON = resultArray;
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class Filter method matches.
/**
 * Decides whether the given document passes this filter.
 *
 * <p>Checks are applied in this order:
 * <ol>
 *   <li>dated documents must satisfy {@code matchesDate};</li>
 *   <li>keyword criteria are NOT enforced here: if keywords are configured, a warning is
 *       logged and a soft assertion is raised, but the document is not rejected;</li>
 *   <li>for e-mail documents, if person contacts are configured and an address book is
 *       available, at least one address of the message must resolve to one of them;</li>
 *   <li>for e-mail documents, if only sent messages are wanted and the owner's contact is
 *       known, the sender address must be one of the owner's addresses.</li>
 * </ol>
 *
 * @param d the document to test
 * @return {@code false} if any enforced criterion rejects the document, {@code true} otherwise
 */
public boolean matches(Document d) {
    if (d instanceof DatedDocument && !matchesDate((DatedDocument) d)) {
        return false;
    }

    if (keywords != null && keywords.size() > 0) {
        // Keyword matching against the document contents is switched off at this stage;
        // we only make noise about it and carry on.
        log.warn("Filtering by keywords during fetch&index is currently disabled");
        Util.softAssert(false, log);
    }

    // Everything below applies to e-mail documents only.
    if (!(d instanceof EmailDocument)) {
        return true;
    }
    EmailDocument ed = (EmailDocument) d;

    // Contact criterion. Without an address book the check is skipped; per the original
    // author's note the filter is expected to be applied again explicitly later.
    if (personContacts.size() > 0 && addressBook != null) {
        Set<Contact> involved = new LinkedHashSet<>();
        for (String address : ed.getAllAddrs()) {
            Contact contact = addressBook.lookupByEmail(address);
            if (contact != null) {
                involved.add(contact);
            }
        }
        involved.retainAll(personContacts);
        if (involved.size() == 0) {
            return false;
        }
    }

    if (!sentMessagesOnly) {
        return true;
    }
    if (ownContact == null) {
        log.warn("WARNING: user error: trying to use sent-only option without setting user's own contact info");
        // Treat as a match: rejecting here would filter out every message.
        return true;
    }
    String sender = ed.getFromEmailAddress();
    Set<String> ownAddresses = ownContact.getEmails();
    return ownAddresses.contains(sender);
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class AddressBook method sortedContactsAndCounts.
/**
 * Counts how often each known contact appears among the addresses of the given documents
 * and returns the (contact, count) pairs in the order produced by
 * {@code Util.sortMapByValue}.
 *
 * <p>Every address returned by {@code getAllAddrs()} is counted, so a contact is tallied
 * once per occurrence (the original author notes that somebody mailing themselves is
 * counted twice). Addresses that do not resolve to a contact are ignored.
 *
 * @param docs the e-mail documents to scan
 * @return contacts paired with their occurrence counts
 */
List<Pair<Contact, Integer>> sortedContactsAndCounts(Collection<EmailDocument> docs) {
    Map<Contact, Integer> tally = new LinkedHashMap<>();
    for (EmailDocument doc : docs) {
        for (String address : doc.getAllAddrs()) {
            Contact contact = lookupByEmail(address);
            if (contact == null) {
                continue;
            }
            // start at 1 for a new contact, otherwise add 1 to the running count
            tally.merge(contact, 1, Integer::sum);
        }
    }
    return Util.sortMapByValue(tally);
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class AddressBook method getCountsAsJSON.
/**
 * Builds a JSON table with one row per contact from the summary maps that were filled
 * during the last change to the address book (loading, recomputing, fresh build).
 *
 * <p>Each row has the following columns:
 * <ol start="0">
 *   <li>best name for the contact, ellipsized to 50 characters and HTML-escaped once</li>
 *   <li>number of sent documents</li>
 *   <li>number of received documents</li>
 *   <li>number of documents received from the owner</li>
 *   <li>browse URL for the contact</li>
 *   <li>HTML-escaped tooltip text</li>
 *   <li>date of the first document, MM/dd/yyyy (null when unknown)</li>
 *   <li>date of the last document, MM/dd/yyyy (null when unknown)</li>
 * </ol>
 *
 * @param exceptOwner if true, the owner's own contact is left out of the table
 * @param archiveID   archive identifier appended to each row's browse URL
 * @return the table as a JSON array of JSON arrays
 */
public JSONArray getCountsAsJSON(boolean exceptOwner, String archiveID) {
    JSONArray resultArray = new JSONArray();
    List<Contact> allContacts = allContacts();
    Contact ownContact = getContactForSelf();
    // One formatter per call: SimpleDateFormat is not thread-safe, so it stays local.
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");

    int row = 0;
    for (Contact c : allContacts) {
        if (exceptOwner && c == ownContact) {
            continue;
        }

        int contactId = getContactId(c);
        String url = "browse?adv-search=1&contact=" + contactId + "&archiveID=" + archiveID;
        // Escaped exactly once here; escaping again when filling the row would turn
        // "&" into "&amp;amp;" and show a literal "&amp;" to the user.
        String nameToPrint = Util.escapeHTML(Util.ellipsize(c.pickBestName(), 50));

        // Look each summary set up once and reuse it for both the count and the date range.
        Set<EmailDocument> receivedDocs = L1_Summary_ReceivedDocs.getOrDefault(c, new LinkedHashSet<>());
        Set<EmailDocument> sentDocs = L1_Summary_SentDocs.getOrDefault(c, new LinkedHashSet<>());
        Set<EmailDocument> recvFromOwnerDocs = L1_Summary_RecFrmOwnerDocs.getOrDefault(c, new LinkedHashSet<>());
        // Sizes are taken before the unions below, in case setUnion modifies its arguments.
        int recvCount = receivedDocs.size();
        int sentCount = sentDocs.size();
        int recvFromOwnerCount = recvFromOwnerDocs.size();

        Set<EmailDocument> alldocs = Util.setUnion(sentDocs, receivedDocs);
        alldocs = Util.setUnion(alldocs, recvFromOwnerDocs);

        // getFirstLast is null-checked elsewhere in this code base, so do not assume a
        // non-null range here either.
        Pair<Date, Date> range = EmailUtils.getFirstLast(alldocs, true);
        Date firstDate = (range == null) ? null : range.first;
        Date lastDate = (range == null) ? null : range.second;
        String firstText = (firstDate == null) ? null : dateFormat.format(firstDate);
        String lastText = (lastDate == null) ? null : dateFormat.format(lastDate);

        JSONArray j = new JSONArray();
        j.put(0, nameToPrint);
        j.put(1, sentCount);
        j.put(2, recvCount);
        j.put(3, recvFromOwnerCount);
        j.put(4, url);
        j.put(5, Util.escapeHTML(c.toTooltip()));
        j.put(6, firstText);
        j.put(7, lastText);
        resultArray.put(row++, j);
        // could consider putting another string which has more info about the contact such as all names and email addresses... this could be shown on hover
    }
    return resultArray;
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class MuseEmailFetcher method updateStats.
/**
 * Fills the given stats object with archive-level figures and registers it on the archive.
 *
 * <p>Recorded figures: the fetcher's data errors, the number of messages in the archive,
 * the first/last message timestamps (when the archive is non-empty), one report line per
 * message that had duplicates, one report line per duplicated attachment blob, and the
 * space saved by both kinds of duplicate detection (bytes divided by 1000).
 *
 * <p>This should probably move to archive.java.
 *
 * @param archive     the archive whose documents, duplicate info and blob store are inspected
 * @param addressBook currently unused; retained so the signature stays unchanged
 * @param stats       the stats object to populate; it is also passed to {@code archive.addStats}
 */
private void updateStats(Archive archive, AddressBook addressBook, FetchStats stats) {
    // NOTE(review): the raw cast presumes every document in the archive is an
    // EmailDocument -- nothing in this method verifies that.
    @SuppressWarnings({ "unchecked", "rawtypes" })
    Collection<EmailDocument> allEmailDocs = (Collection) archive.getAllDocs();

    stats.dataErrors = getDataErrors();
    stats.nMessagesInArchive = allEmailDocs.size();

    /* compute stats for time range */
    if (allEmailDocs.size() > 0) {
        Pair<Date, Date> range = EmailUtils.getFirstLast(allEmailDocs);
        stats.firstMessageDate = range.getFirst() == null ? 0 : range.getFirst().getTime();
        stats.lastMessageDate = range.getSecond() == null ? 0 : range.getSecond().getTime();
    }

    // Duplicate messages: one formatted report per original message, built from the
    // archive's dupMessageInfo map (filled while fetching).
    long sizeSavedFromDupMessages = 0;
    Collection<String> dupMessages = new LinkedHashSet<>();
    for (Document doc : archive.getDupMessageInfo().keySet()) {
        EmailDocument edoc = (EmailDocument) doc;
        // number of duplicates found for this emaildocument
        int numofduplicates = archive.getDupMessageInfo().get(doc).size();

        // Only attachment sizes are counted towards the saving.
        long attachmentBytes = 0;
        for (Blob b : edoc.attachments) {
            attachmentBytes += b.size;
        }
        long sizesaved = numofduplicates * attachmentBytes;

        StringBuilder sb = new StringBuilder();
        sb.append("Duplicate message: Following messages were found as duplicates of\n message id #")
                .append(edoc.getUniqueId()).append(" (").append(edoc.folderName).append("):\n");
        int ordinal = 1;
        for (Tuple2 dup : archive.getDupMessageInfo().get(doc)) {
            sb.append(" " + ordinal + ". " + "Message id # " + dup.getSecond() + " (" + dup.getFirst() + ")\n");
            ordinal++;
        }
        if (sizesaved != 0) {
            sb.append("***** Saved " + sizesaved + " bytes by detecting these duplicates\n");
            sizeSavedFromDupMessages += sizesaved;
        }
        dupMessages.add(sb.toString());
    }
    stats.dataErrors.addAll(dupMessages);

    // Duplicate attachments, as counted by the blob store.
    long sizeSavedFromDupAttachments = 0;
    Collection<String> dupBlobMessages = new LinkedHashSet<>();
    Map<Blob, Integer> dupblobs = archive.getBlobStore().getDupBlobCount();
    for (Map.Entry<Blob, Integer> dupEntry : dupblobs.entrySet()) {
        Blob blob = dupEntry.getKey();
        // Widen before multiplying so a large count times a large size cannot overflow int.
        long bytesSaved = (long) dupEntry.getValue() * blob.size;
        dupBlobMessages.add("Duplicate attachments:" + dupEntry.getValue() + " duplicate attachments found of " + archive.getBlobStore().full_filename_normalized(blob) + ". Total space saved by not storing these duplicates is " + bytesSaved + " bytes\n");
        sizeSavedFromDupAttachments += bytesSaved;
    }
    stats.dataErrors.addAll(dupBlobMessages);

    stats.spaceSavingFromDupMessageDetection = sizeSavedFromDupMessages / 1000;
    stats.spaceSavingFromDupAttachmentDetection = sizeSavedFromDupAttachments / 1000;

    archive.addStats(stats);
    log.info("Fetcher stats: " + stats);
}
Aggregations