use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class AddressBook method getCountsAsJson.
/* public JSONArray getCountsAsJson(Collection<EmailDocument> docs, String archiveID) {
return getCountsAsJson(docs, false */
/* we don't want to exceptOwner */
/*,archiveID);
}
*/
/**
 * Computes per-contact message counts for the given documents; used primarily by correspondents.jsp.
 * <p>
 * For every message:
 * <ul>
 * <li>if it is flagged as sent, each to/cc/bcc contact (possibly including the owner) gets +1 on its out count;</li>
 * <li>if it is flagged as received, or is not flagged as sent, the sender gets +1 on its in count;</li>
 * <li>if it is not flagged as sent, each to/cc/bcc contact other than the owner gets +1 on its mention count.</li>
 * </ul>
 * A message can be both sent and received.
 *
 * @param docs        the messages to count over
 * @param exceptOwner if true, the owner's own contact is left out of the result
 * @param archiveID   appended verbatim to each row's browse URL
 * @return one row per contact, in the order returned by {@code sortedContacts(docs)}; each row is a json array of
 *         6 elements: [name, in, out, mentions, url, tooltip]. Name and tooltip are HTML-escaped (exactly once).
 */
public JSONArray getCountsAsJson(Collection<EmailDocument> docs, boolean exceptOwner, String archiveID) {
    Contact ownContact = getContactForSelf();
    List<Contact> allContacts = sortedContacts((Collection) docs);
    Map<Contact, Integer> contactInCount = new LinkedHashMap<>();
    Map<Contact, Integer> contactOutCount = new LinkedHashMap<>();
    Map<Contact, Integer> contactMentionCount = new LinkedHashMap<>();

    // compute counts
    for (EmailDocument ed : docs) {
        Contact senderContact = this.lookupByEmail(ed.getFromEmailAddress());
        if (senderContact == null) {
            // should never happen, we should always have a sender contact
            senderContact = ownContact;
        }

        int x = ed.sentOrReceived(this);
        boolean sent = (x & EmailDocument.SENT_MASK) != 0;
        // explicitly received, or not explicitly sent, in which case we count it as received by default
        boolean received = (x & EmailDocument.RECEIVED_MASK) != 0 || !sent;

        if (sent) {
            // each to/cc/bcc gets +1 outcount. one of them could be own contact also.
            for (Contact c : ed.getToCCBCCContacts(this)) {
                contactOutCount.merge(c, 1, Integer::sum);
            }
        }

        if (received) {
            // sender gets a +1 in count (could be ownContact also)
            contactInCount.merge(senderContact, 1, Integer::sum);
        }

        if (!sent) {
            // add mentions for everyone who's not me, who's on the to/cc/bcc of this message.
            for (Contact c : ed.getToCCBCCContacts(this)) {
                if (c == ownContact) {
                    continue;
                }
                contactMentionCount.merge(c, 1, Integer::sum);
            }
        }
    }

    JSONArray resultArray = new JSONArray();
    for (Contact c : allContacts) {
        if (exceptOwner && c == ownContact) {
            continue;
        }

        int contactId = getContactId(c);
        String url = "browse?adv-search=1&contact=" + contactId + "&archiveID=" + archiveID;
        // escape exactly once; escaping the already-escaped name again would turn "&" into "&amp;amp;"
        String nameToPrint = Util.escapeHTML(Util.ellipsize(c.pickBestName(), 50));

        JSONArray j = new JSONArray();
        j.put(0, nameToPrint);
        j.put(1, contactInCount.getOrDefault(c, 0));
        j.put(2, contactOutCount.getOrDefault(c, 0));
        j.put(3, contactMentionCount.getOrDefault(c, 0));
        j.put(4, url);
        // tooltip text for the contact, shown on hover
        j.put(5, Util.escapeHTML(c.toTooltip()));
        resultArray.put(j);
    }
    return resultArray;
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class AddressBook method main.
/**
 * Ad-hoc smoke test: prints the names guessed from a few email addresses, then checks that an
 * address book picks up and merges the contacts of hand-built messages.
 */
public static void main(String[] args) {
    String[] sampleAddresses = { "mickey.mouse@disney.com", "donald_duck@disney.com", "70451.2444@compuserve.com" };
    for (String sampleAddress : sampleAddresses) {
        List<String> list = EmailUtils.parsePossibleNamesFromEmailAddress(sampleAddress);
        System.out.println(Util.join(list, " "));
    }

    String ownerName = "Owner Name";
    String ownerEmail = "owner@example.com";

    // scenario 1: four distinct correspondents on one message
    {
        AddressBook ab = new AddressBook(new String[] { ownerEmail }, new String[] { ownerName });
        EmailDocument ed = new EmailDocument();
        try {
            // Previously this was a second assignment to ed.to that got overwritten below, leaving only 3 addresses.
            // It needs its own name as well as its own address: scenario 2 relies on contacts with the same
            // name being merged, so reusing "From Last" here would collapse two of the four contacts.
            // NOTE(review): assumes EmailDocument exposes a bcc field alongside to/cc/from - confirm.
            ed.bcc = new Address[] { new InternetAddress("bcc@example.com", "Bcc Last") };
            ed.cc = new Address[] { new InternetAddress("cc@email.com", "CC Last") };
            ed.to = new Address[] { new InternetAddress("to@example.com", "To Last") };
            ed.from = new Address[] { new InternetAddress("from@example.com", "From Last") };
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
        ab.processContactsFromMessage(ed);
        // 4 addresses should be added + owner
        Util.ASSERT(ab.size() == 5);
    }

    // scenario 2: merging by shared email address (ed1) and by shared name (ed2)
    {
        AddressBook ab = new AddressBook(new String[] { ownerEmail }, new String[] { ownerName });
        EmailDocument ed1 = new EmailDocument(), ed2 = new EmailDocument();
        try {
            // InternetAddress takes (address, personal) in that order
            ed1.to = new Address[] { new InternetAddress("mergename@example.com", "Merge Name") };
            ed1.from = new Address[] { new InternetAddress("mergename@example.com", "Merge Name2") };
            ed2.to = new Address[] { new InternetAddress("mergeemail1@example.com", "Merge X Name") };
            ed2.from = new Address[] { new InternetAddress("mergeemail2@example.com", "Merge X Name") };
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
        // these must run on the normal path, not only when building the addresses failed
        ab.processContactsFromMessage(ed1);
        ab.processContactsFromMessage(ed2);
        // one contact per merged pair + owner
        Util.ASSERT(ab.size() == 3);
        // 2 names for this email address
        Util.ASSERT(ab.lookupByEmail("mergename@example.com").getNames().size() == 2);
    }
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class MuseEmailFetcher method fetchAndIndexEmails.
/**
 * Key method to fetch actual email messages. Can take a long time.
 * Runs every configured fetcher in the calling thread, then rebuilds the archive's address book over
 * all documents in the archive and records fetch statistics.
 *
 * @param archive            the archive that receives the messages; its blob store is used for attachments
 * @param selectedFolders    each entry is in the format {@code <account name>^-^<folder name>}
 * @param useDefaultFolders  passed through to {@code setupFoldersForFetchers}
 * @param fetchConfig        fetch options; {@code downloadAttachments} is switched off here if the archive has no blob store
 * @param session            is used only to put a status provider object in. Callers who do not need to track
 *                           status can leave it as null, in which case no status object is set.
 * @throws CancelledException if a fetcher reports that the operation was cancelled
 * @throws OutOfMemoryError   if a fetcher reports that it may have run out of memory
 * @throws NoDefaultFolderException
 */
public void fetchAndIndexEmails(Archive archive, String[] selectedFolders, boolean useDefaultFolders, FetchConfig fetchConfig, HttpSession session) throws MessagingException, InterruptedException, IOException, JSONException, NoDefaultFolderException, CancelledException {
    setupFetchers(-1);
    long startTime = System.currentTimeMillis();
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Starting to process messages..."));

    boolean op_cancelled = false, out_of_mem = false;
    BlobStore attachmentsStore = archive.getBlobStore();
    // attachments can only be downloaded if there is a store to put them in
    fetchConfig.downloadAttachments = fetchConfig.downloadAttachments && attachmentsStore != null;

    if (Util.nullOrEmpty(fetchers)) {
        log.warn("Trying to fetch email with no fetchers, setup not called ?");
        return;
    }
    setupFoldersForFetchers(fetchers, selectedFolders, useDefaultFolders);

    List<FolderInfo> fetchedFolderInfos = new ArrayList<>();
    // one fetcher will aggregate everything
    FetchStats stats = new FetchStats();
    MTEmailFetcher aggregatingFetcher = null;

    // a fetcher is one source, like an account or a top-level mbox dir. A fetcher could include multiple folders.
    long startTimeMillis = System.currentTimeMillis();
    for (MTEmailFetcher fetcher : fetchers) {
        if (session != null)
            session.setAttribute("statusProvider", fetcher);
        fetcher.setArchive(archive);
        fetcher.setFetchConfig(fetchConfig);
        log.info("Memory status before fetching emails: " + Util.getMemoryStats());

        // this is the big call, can run for a long time. Note: running in the same thread, its not fetcher.start();
        List<FolderInfo> foldersFetchedByThisFetcher = fetcher.run();

        // on cancel / out-of-memory, stop running further fetchers; the exception is raised right after the loop.
        // NOTE(review): an earlier comment said the address book would still be built for the processed
        // messages, but on this path the method throws before the address book is built.
        if (fetcher.isCancelled()) {
            log.info("NOTE: fetcher operation was cancelled");
            op_cancelled = true;
            break;
        }
        if (fetcher.mayHaveRunOutOfMemory()) {
            log.warn("Fetcher operation ran out of memory " + fetcher);
            out_of_mem = true;
            break;
        }

        // the nullOrEmpty check below treats a null result as possible, so guard addAll as well
        if (foldersFetchedByThisFetcher != null)
            fetchedFolderInfos.addAll(foldersFetchedByThisFetcher);

        if (aggregatingFetcher == null && !Util.nullOrEmpty(foldersFetchedByThisFetcher))
            // first non-empty fetcher
            aggregatingFetcher = fetcher;
        if (aggregatingFetcher != null)
            aggregatingFetcher.merge(fetcher);

        // add the folders indexed by THIS fetcher to the stats. Iterating the cumulative fetchedFolderInfos
        // here would re-add earlier fetchers' folders under this fetcher's description.
        EmailStore store = fetcher.getStore();
        String fetcherDescription = store.displayName + ":" + store.emailAddress;
        if (foldersFetchedByThisFetcher != null) {
            for (FolderInfo fi : foldersFetchedByThisFetcher)
                stats.selectedFolders.add(new Pair<>(fetcherDescription, fi));
        }
    }

    if (op_cancelled)
        throw new CancelledException();
    if (out_of_mem)
        throw new OutOfMemoryError();

    if (aggregatingFetcher != null) {
        stats.importStats = aggregatingFetcher.stats;
        if (aggregatingFetcher.mayHaveRunOutOfMemory())
            throw new OutOfMemoryError();
    }
    // save memory
    aggregatingFetcher = null;

    long endTimeMillis = System.currentTimeMillis();
    long elapsedMillis = endTimeMillis - startTimeMillis;
    log.info(elapsedMillis + " ms for fetch+index, Memory status: " + Util.getMemoryStats());

    // note: this is all archive docs, not just the ones that may have been just imported
    List<EmailDocument> allEmailDocs = (List) archive.getAllDocs();
    archive.addFetchedFolderInfos(fetchedFolderInfos);
    if (allEmailDocs.size() == 0)
        log.warn("0 messages from email fetcher");
    EmailUtils.cleanDates(allEmailDocs);

    // create a new address book
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Building address book..."));
    AddressBook addressBook = EmailDocument.buildAddressBook(allEmailDocs, archive.ownerEmailAddrs, archive.ownerNames);
    log.info("Address book stats: " + addressBook.getStats());

    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Finishing up..."));
    archive.setAddressBook(addressBook);

    // we shouldn't really have dups now because the archive ensures that only unique docs are added
    // move sorting to archive.postprocess?
    EmailUtils.removeDupsAndSort(allEmailDocs);

    // report stats
    stats.lastUpdate = new Date().getTime();
    // (String) JSPHelper.getSessionAttribute(session, "userKey");
    stats.userKey = "USER KEY UNUSED";
    stats.fetchAndIndexTimeMillis = elapsedMillis;
    updateStats(archive, addressBook, stats);

    if (session != null)
        session.removeAttribute("statusProvider");
    log.info("Fetch+index complete: " + Util.commatize(System.currentTimeMillis() - startTime) + " ms");
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class EntityBook method fillSummaryFields.
/**
 * Rebuilds the per-entity summary map and its JSON rendering from the given entity-to-documents map.
 * For each entity a Summary_L1 (score, messages, first/last date) is stored in summary_L1_entityCountMap,
 * and a row [name, score, #messages, alt names, start date, end date, entity type] is added to summary_JSON.
 * Dates that cannot be derived from the messages fall back to the collection's first/last date.
 *
 * @param docsetmap for each entity, its score and the set of documents it occurs in
 * @param archive   supplies the collection-wide first and last dates used as fallback
 */
public void fillSummaryFields(Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap, Archive archive) {
    JSONArray rows = new JSONArray();
    summary_L1_entityCountMap.clear();
    SimpleDateFormat dayFormat = new SimpleDateFormat("MM/dd/yyyy");

    int rowIndex = 0;
    for (Map.Entry<MappedEntity, Pair<Double, Set<Document>>> entry : docsetmap.entrySet()) {
        MappedEntity mappedEntity = entry.getKey();
        Pair<Double, Set<Document>> scoreAndDocs = entry.getValue();

        Summary_L1 summary = new Summary_L1();
        summary.score = scoreAndDocs.first;
        summary.messages = scoreAndDocs.second;

        // date range of the entity's messages, with the collection's range filling any gap
        Collection<EmailDocument> emailDocs = summary.messages.stream()
                .map(EmailDocument.class::cast)
                .collect(Collectors.toList());
        Pair<Date, Date> range = EmailUtils.getFirstLast(emailDocs, true);
        if (range == null) {
            range = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
        }
        if (range.first == null) {
            range.first = archive.collectionMetadata.firstDate;
        }
        if (range.second == null) {
            range.second = archive.collectionMetadata.lastDate;
        }
        summary.startDate = range.first;
        summary.endDate = range.second;
        summary_L1_entityCountMap.put(mappedEntity, summary);

        String displayName = mappedEntity.getDisplayName();
        Set<String> altNamesSet = mappedEntity.getAltNames();
        String altNames;
        if (altNamesSet == null) {
            altNames = "";
        } else {
            altNames = "Alternate names: " + Util.join(altNamesSet, ";");
        }

        // a missing date is stored as a null element, otherwise as a formatted string
        Object startCell = (summary.startDate == null) ? null : dayFormat.format(summary.startDate);
        Object endCell = (summary.endDate == null) ? null : dayFormat.format(summary.endDate);

        JSONArray row = new JSONArray();
        row.put(0, Util.escapeHTML(displayName));
        row.put(1, summary.score);
        row.put(2, summary.messages.size());
        row.put(3, altNames);
        row.put(4, startCell);
        row.put(5, endCell);
        // add entity type as well..
        row.put(6, NEType.getTypeForCode(entityType).getDisplayName());

        rows.put(rowIndex, row);
        rowIndex++;
    }
    summary_JSON = rows;
}
use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.
the class EntityFeature method checkIndex.
/**
 * Checks for the feature index and creates it if required.
 * When building, every document in the archive is scanned: person/org/place entity names and
 * multi-word correspondent names are canonicalized, one EntityFeature is kept per canonical name,
 * and each feature's prior probability ends up as its occurrence count divided by the total
 * number of occurrences over all documents.
 *
 * @param archive the archive whose documents are analysed and in which the index is looked up
 * @param force   force creation of the index irrespective of previous existence of the index
 * @return true if the index already existed and was not forced, otherwise the result of indexing;
 *         false if the operation was cancelled
 */
private boolean checkIndex(Archive archive, boolean force) {
    Boolean exists = indexExists(archive);
    if (!force && exists)
        return true;

    // c*: raw entity counts, g*: canonicalized names accepted, f*: new features created (person, org, place)
    int c1 = 0, c2 = 0, c3 = 0;
    int g1 = 0, g2 = 0, g3 = 0;
    int f1 = 0, f2 = 0, f3 = 0;

    Map<String, EntityFeature> features = new HashMap<>();
    Collection<EmailDocument> docs = (Collection) archive.getAllDocs();
    int totalEntities = 0;
    log.info("No feature index found..., starting to process and index. This can take a while.");

    int di = 0;
    for (EmailDocument ed : docs) {
        if (cancel) {
            clean(archive);
            return false;
        }
        // progress report every 1000 documents; analysis accounts for the first 50% of the work
        if (di % 1000 == 0) {
            JSPHelper.log.info("Done analysing documents: " + di + " of: " + docs.size());
            status = "Analyzed " + di + "/" + docs.size() + " email documents";
            pctComplete = ((double) di * 50) / (double) docs.size();
        }
        di++;

        List<Span> names;
        try {
            names = Arrays.asList(archive.getAllNamesInDoc(ed, true));
        } catch (IOException ioe) {
            log.error("Problem accessing entities in " + ed.getUniqueId(), ioe);
            continue;
        }

        List<String> entities = names.stream().filter(n -> n.type == NEType.Type.PERSON.getCode()).map(n -> n.text).collect(Collectors.toList());
        List<String> places = names.stream().filter(n -> n.type == NEType.Type.PLACE.getCode()).map(n -> n.text).collect(Collectors.toList());
        List<String> orgs = names.stream().filter(n -> n.type == NEType.Type.ORGANISATION.getCode()).map(n -> n.text).collect(Collectors.toList());
        // Collectors.toList() never yields null, so the sizes can be added directly
        c1 += entities.size();
        c2 += orgs.size();
        c3 += places.size();

        // canonical name -> kind ("person", "org" or "places"); a later put overrides an earlier kind
        Map<String, String> goodNames = new HashMap<>();
        List<String> correspondents = ed.getAllNames();
        List<String> addresses = ed.getAllAddrs();

        if (correspondents != null) {
            for (String c : correspondents) {
                if (c != null && c.contains(" ")) {
                    // EmailUtils.normalizePersonNameForLookup(c);
                    String n = IndexUtils.canonicalizeEntity(c);
                    // same guard as the entity/org/place loops below: never register a null name
                    if (n == null)
                        continue;
                    goodNames.put(n, "person");
                }
            }
        }
        for (String e : entities) {
            // only multi-word person names are considered
            if (e != null && e.contains(" ")) {
                String canonicalEntity = IndexUtils.canonicalizeEntity(e);
                if (canonicalEntity == null)
                    continue;
                goodNames.put(canonicalEntity, "person");
                g1++;
            }
        }
        for (String o : orgs) {
            String canonicalEntity = IndexUtils.canonicalizeEntity(o);
            if (canonicalEntity == null)
                continue;
            goodNames.put(canonicalEntity, "org");
            g2++;
        }
        for (String p : places) {
            String canonicalEntity = IndexUtils.canonicalizeEntity(p);
            if (canonicalEntity == null)
                continue;
            goodNames.put(canonicalEntity, "places");
            g3++;
        }

        // O(goodNames.size())
        for (Map.Entry<String, String> good : goodNames.entrySet()) {
            String gn = good.getKey();
            String kind = good.getValue();

            EntityFeature feature = features.get(gn);
            if (feature == null) {
                if (kind.equals("person")) {
                    feature = new EntityFeature(gn, EntityFeature.PERSON);
                    f1++;
                } else if (kind.equals("org")) {
                    feature = new EntityFeature(gn, EntityFeature.ORG);
                    f2++;
                } else {
                    // "places" is the only other kind ever stored in goodNames
                    feature = new EntityFeature(gn, EntityFeature.PLACE);
                    f3++;
                }
                features.put(gn, feature);
            }

            feature.accountForThis();
            // every name found in this document, including gn itself, is passed as co-occurring
            feature.addAllCE(goodNames.keySet());
            if (addresses != null)
                feature.addAllEA(addresses);
            // raw occurrence count for now; normalized once all documents are processed
            feature.priorProbablity = feature.priorProbablity + 1.0;
            totalEntities++;
        }
    }

    log.info("Found: " + c1 + " entities, " + c2 + " orgs and " + c3 + " places");
    log.info("Gn: " + g1 + " entities, " + g2 + " orgs and " + g3 + " places");
    log.info("Found goodfeatures: " + f1 + " entities, " + f2 + " orgs and " + f3 + " places");

    // turn occurrence counts into fractions of all occurrences (no-op when nothing was found)
    for (EntityFeature feature : features.values())
        feature.priorProbablity = feature.priorProbablity / (double) totalEntities;

    log.info("Done analysing docs. Starting to index.");
    return index(features, archive);
}
Aggregations