Search in sources :

Example 1 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class AddressBook method getCountsAsJson.

/*  public JSONArray getCountsAsJson(Collection<EmailDocument> docs, String archiveID) {
        return getCountsAsJson(docs, false */
/* we don't want to exceptOwner */
/*,archiveID);
    }
*/
/**
 * Computes per-contact message counts over the given documents; used primarily by correspondents.jsp.
 * <p>
 * One row is produced for every contact returned by {@code sortedContacts(docs)}, in that order.
 * Each row is a JSON array of 6 elements:
 * [HTML-escaped name (ellipsized to 50 chars), in count, out count, mention count, browse url, HTML-escaped tooltip].
 * <p>
 * Counting rules, applied per message:
 * <ul>
 * <li>sent message: every to/cc/bcc contact gets +1 out count (this may include the owner);</li>
 * <li>received message (explicitly received, or simply not marked as sent): the sender gets +1 in count;</li>
 * <li>message not marked as sent: every to/cc/bcc contact other than the owner gets +1 mention count.</li>
 * </ul>
 *
 * @param docs        the messages to count over
 * @param exceptOwner if true, the owner's own contact is left out of the result
 * @param archiveID   value placed in the {@code archiveID} parameter of each row's browse url
 * @return a JSON array containing one row per contact
 */
public JSONArray getCountsAsJson(Collection<EmailDocument> docs, boolean exceptOwner, String archiveID) {
    Contact ownContact = getContactForSelf();
    List<Contact> allContacts = sortedContacts((Collection) docs);
    Map<Contact, Integer> contactInCount = new LinkedHashMap<>();
    Map<Contact, Integer> contactOutCount = new LinkedHashMap<>();
    Map<Contact, Integer> contactMentionCount = new LinkedHashMap<>();

    // compute counts
    for (EmailDocument ed : docs) {
        Contact senderContact = this.lookupByEmail(ed.getFromEmailAddress());
        if (senderContact == null) {
            // should never happen, we should always have a sender contact
            senderContact = ownContact;
        }

        // a message could be both sent and received
        int sentOrReceived = ed.sentOrReceived(this);
        boolean sent = (sentOrReceived & EmailDocument.SENT_MASK) != 0;
        // if it is not explicitly sent, we count it as received by default
        boolean received = (sentOrReceived & EmailDocument.RECEIVED_MASK) != 0 || !sent;

        if (sent) {
            // each to/cc/bcc gets +1 out count; one of them could be the own contact also
            for (Contact c : ed.getToCCBCCContacts(this)) {
                contactOutCount.merge(c, 1, Integer::sum);
            }
        }

        if (received) {
            // sender gets +1 in count (could be ownContact also)
            contactInCount.merge(senderContact, 1, Integer::sum);
        }

        if (!sent) {
            // add mentions for everyone on the to/cc/bcc of this message, except the owner
            for (Contact c : ed.getToCCBCCContacts(this)) {
                if (c == ownContact) {
                    continue;
                }
                contactMentionCount.merge(c, 1, Integer::sum);
            }
        }
    }

    JSONArray resultArray = new JSONArray();
    for (Contact c : allContacts) {
        if (c == ownContact && exceptOwner) {
            continue;
        }

        int contactId = getContactId(c);
        String url = "browse?adv-search=1&contact=" + contactId + "&archiveID=" + archiveID;
        // escape exactly once, like the tooltip below; escaping twice makes entities such as &amp; show up literally
        String nameToPrint = Util.escapeHTML(Util.ellipsize(c.pickBestName(), 50));

        JSONArray row = new JSONArray();
        row.put(0, nameToPrint);
        row.put(1, contactInCount.getOrDefault(c, 0));
        row.put(2, contactOutCount.getOrDefault(c, 0));
        row.put(3, contactMentionCount.getOrDefault(c, 0));
        row.put(4, url);
        row.put(5, Util.escapeHTML(c.toTooltip()));
        resultArray.put(row);
    }
    return resultArray;
}
Also used : EmailDocument(edu.stanford.muse.index.EmailDocument) JSONArray(org.json.JSONArray)

Example 2 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class AddressBook method main.

/**
 * Ad-hoc smoke test for name parsing and address book building.
 * Prints the possible names parsed from a few sample email addresses, then checks (via Util.ASSERT)
 * that contacts from a message are added to an address book and that names/addresses get merged.
 *
 * @param args unused
 */
public static void main(String[] args) {
    List<String> list = EmailUtils.parsePossibleNamesFromEmailAddress("mickey.mouse@disney.com");
    System.out.println(Util.join(list, " "));
    list = EmailUtils.parsePossibleNamesFromEmailAddress("donald_duck@disney.com");
    System.out.println(Util.join(list, " "));
    list = EmailUtils.parsePossibleNamesFromEmailAddress("70451.2444@compuserve.com");
    System.out.println(Util.join(list, " "));
    String ownerName = "Owner Name";
    String ownerEmail = "owner@example.com";
    {
        AddressBook ab = new AddressBook(new String[] { ownerEmail }, new String[] { ownerName });
        EmailDocument ed = new EmailDocument();
        try {
            // InternetAddress takes (address, personal name).
            // Four distinct addresses with four distinct names, one per header field.
            // NOTE(review): assumes EmailDocument exposes a bcc field alongside to/cc/from — confirm.
            ed.to = new Address[] { new InternetAddress("to@example.com", "To Last") };
            ed.cc = new Address[] { new InternetAddress("cc@email.com", "CC Last") };
            ed.bcc = new Address[] { new InternetAddress("bcc@example.com", "BCC Last") };
            ed.from = new Address[] { new InternetAddress("from@example.com", "From Last") };
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
        ab.processContactsFromMessage(ed);
        // 4 addresses should be added + owner
        Util.ASSERT(ab.size() == 5);
    }
    {
        AddressBook ab = new AddressBook(new String[] { ownerEmail }, new String[] { ownerName });
        EmailDocument ed1 = new EmailDocument(), ed2 = new EmailDocument();
        try {
            // same address under two different names
            ed1.to = new Address[] { new InternetAddress("mergename@example.com", "Merge Name") };
            ed1.from = new Address[] { new InternetAddress("mergename@example.com", "Merge Name2") };
            // same name under two different addresses
            ed2.to = new Address[] { new InternetAddress("mergeemail1@example.com", "Merge X Name") };
            ed2.from = new Address[] { new InternetAddress("mergeemail2@example.com", "Merge X Name") };
        } catch (Exception e) {
            Util.print_exception(e, log);
        }
        // processing must happen on the normal path, not only when address construction fails
        ab.processContactsFromMessage(ed1);
        ab.processContactsFromMessage(ed2);
        // owner + one contact per merged group
        Util.ASSERT(ab.size() == 3);
        // 2 names for this email address
        Util.ASSERT(ab.lookupByEmail("mergename@example.com").getNames().size() == 2);
    }
}
Also used : InternetAddress(javax.mail.internet.InternetAddress) Address(javax.mail.Address) InternetAddress(javax.mail.internet.InternetAddress) EmailDocument(edu.stanford.muse.index.EmailDocument)

Example 3 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class MuseEmailFetcher method fetchAndIndexEmails.

/**
 * Key method to fetch actual email messages. Can take a long time.
 * Runs every configured fetcher in the calling thread, then rebuilds the archive's address book
 * from all documents in the archive and records fetch statistics.
 *
 * @param archive the archive that receives the fetched messages; its blob store, documents and owner details are read, and its address book is replaced
 * @param selectedFolders is in the format <account name>^-^<folder name>
 * @param useDefaultFolders passed through to the folder setup for the fetchers
 * @param fetchConfig fetch options; attachment download is switched off here if the archive has no blob store
 * @param session is used only to set the status provider object. callers who do not need to track status can leave it as null
 * @throws CancelledException if a fetcher reports that the operation was cancelled
 * @throws OutOfMemoryError if a fetcher reports that it may have run out of memory
 * @throws NoDefaultFolderException declared for callers; not thrown directly in this method
 */
public void fetchAndIndexEmails(Archive archive, String[] selectedFolders, boolean useDefaultFolders, FetchConfig fetchConfig, HttpSession session) throws MessagingException, InterruptedException, IOException, JSONException, NoDefaultFolderException, CancelledException {
    setupFetchers(-1);
    long startTime = System.currentTimeMillis();
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Starting to process messages..."));
    boolean op_cancelled = false, out_of_mem = false;
    BlobStore attachmentsStore = archive.getBlobStore();
    fetchConfig.downloadAttachments = fetchConfig.downloadAttachments && attachmentsStore != null;
    if (Util.nullOrEmpty(fetchers)) {
        log.warn("Trying to fetch email with no fetchers, setup not called ?");
        return;
    }
    setupFoldersForFetchers(fetchers, selectedFolders, useDefaultFolders);
    List<FolderInfo> fetchedFolderInfos = new ArrayList<>();
    // one fetcher will aggregate everything
    FetchStats stats = new FetchStats();
    MTEmailFetcher aggregatingFetcher = null;
    // a fetcher is one source, like an account or a top-level mbox dir. A fetcher could include multiple folders.
    long startTimeMillis = System.currentTimeMillis();
    for (MTEmailFetcher fetcher : fetchers) {
        if (session != null)
            session.setAttribute("statusProvider", fetcher);
        fetcher.setArchive(archive);
        fetcher.setFetchConfig(fetchConfig);
        log.info("Memory status before fetching emails: " + Util.getMemoryStats());
        // this is the big call, can run for a long time. Note: running in the same thread, its not fetcher.start();
        List<FolderInfo> foldersFetchedByThisFetcher = fetcher.run();
        // on cancel / out-of-memory stop running further fetchers,
        // but don't abort immediately, only after the loop
        if (fetcher.isCancelled()) {
            log.info("NOTE: fetcher operation was cancelled");
            op_cancelled = true;
            break;
        }
        if (fetcher.mayHaveRunOutOfMemory()) {
            log.warn("Fetcher operation ran out of memory " + fetcher);
            out_of_mem = true;
            break;
        }
        fetchedFolderInfos.addAll(foldersFetchedByThisFetcher);
        if (aggregatingFetcher == null && !Util.nullOrEmpty(foldersFetchedByThisFetcher))
            // first non-empty fetcher
            aggregatingFetcher = fetcher;
        // NOTE(review): for the first non-empty fetcher this merges the fetcher into itself — confirm merge() tolerates that
        if (aggregatingFetcher != null)
            aggregatingFetcher.merge(fetcher);
        // add the indexed folders to the stats.
        // Only the folders fetched by THIS fetcher are attributed to its description; iterating the
        // accumulated list here would re-add earlier fetchers' folders under every later fetcher.
        EmailStore store = fetcher.getStore();
        String fetcherDescription = store.displayName + ":" + store.emailAddress;
        for (FolderInfo fi : foldersFetchedByThisFetcher)
            stats.selectedFolders.add(new Pair<>(fetcherDescription, fi));
    }
    if (op_cancelled)
        throw new CancelledException();
    if (out_of_mem)
        throw new OutOfMemoryError();
    if (aggregatingFetcher != null) {
        stats.importStats = aggregatingFetcher.stats;
        if (aggregatingFetcher.mayHaveRunOutOfMemory())
            throw new OutOfMemoryError();
    }
    // save memory
    aggregatingFetcher = null;
    long endTimeMillis = System.currentTimeMillis();
    long elapsedMillis = endTimeMillis - startTimeMillis;
    log.info(elapsedMillis + " ms for fetch+index, Memory status: " + Util.getMemoryStats());
    // note: this is all archive docs, not just the ones that may have been just imported
    List<EmailDocument> allEmailDocs = (List) archive.getAllDocs();
    archive.addFetchedFolderInfos(fetchedFolderInfos);
    if (allEmailDocs.size() == 0)
        log.warn("0 messages from email fetcher");
    EmailUtils.cleanDates(allEmailDocs);
    // create a new address book
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Building address book..."));
    AddressBook addressBook = EmailDocument.buildAddressBook(allEmailDocs, archive.ownerEmailAddrs, archive.ownerNames);
    log.info("Address book stats: " + addressBook.getStats());
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Finishing up..."));
    archive.setAddressBook(addressBook);
    // we shouldn't really have dups now because the archive ensures that only unique docs are added
    // move sorting to archive.postprocess?
    EmailUtils.removeDupsAndSort(allEmailDocs);
    // report stats
    stats.lastUpdate = new Date().getTime();
    // (String) JSPHelper.getSessionAttribute(session, "userKey");
    stats.userKey = "USER KEY UNUSED";
    stats.fetchAndIndexTimeMillis = elapsedMillis;
    updateStats(archive, addressBook, stats);
    if (session != null)
        session.removeAttribute("statusProvider");
    log.info("Fetch+index complete: " + Util.commatize(System.currentTimeMillis() - startTime) + " ms");
}
Also used : CancelledException(edu.stanford.muse.exceptions.CancelledException) EmailDocument(edu.stanford.muse.index.EmailDocument) AddressBook(edu.stanford.muse.AddressBookManager.AddressBook) BlobStore(edu.stanford.muse.datacache.BlobStore) Pair(edu.stanford.muse.util.Pair)

Example 4 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class EntityBook method fillSummaryFields.

/**
 * Rebuilds this book's level-1 summary map and its JSON summary from the given entity-to-documents map.
 * <p>
 * For every entry a Summary_L1 is stored in {@code summary_L1_entityCountMap} (which is cleared first),
 * and a JSON row is appended to the array assigned to {@code summary_JSON}. Each row holds:
 * [HTML-escaped display name, score, number of messages, alternate names text, start date, end date, entity type name].
 * Dates are formatted as MM/dd/yyyy, or left null when no date is available.
 *
 * @param docsetmap for each entity, a pair of (score, set of documents the entity occurs in)
 * @param archive   supplies the collection-wide first/last dates used when a date range cannot be derived from the documents
 */
public void fillSummaryFields(Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap, Archive archive) {
    JSONArray resultArray = new JSONArray();
    // SimpleDateFormat is not thread-safe, so it stays local to this call; one instance serves every row
    SimpleDateFormat dateFormat = new SimpleDateFormat("MM/dd/yyyy");
    summary_L1_entityCountMap.clear();
    for (Map.Entry<MappedEntity, Pair<Double, Set<Document>>> entry : docsetmap.entrySet()) {
        MappedEntity mappedEntity = entry.getKey();
        Summary_L1 summary = new Summary_L1();
        summary.score = entry.getValue().first;
        summary.messages = entry.getValue().second;

        // get date range, falling back to the collection-wide dates for anything missing
        Collection<EmailDocument> emaildocs = summary.messages.stream().map(s -> (EmailDocument) s).collect(Collectors.toList());
        Pair<Date, Date> daterange = EmailUtils.getFirstLast(emaildocs, true);
        if (daterange == null) {
            daterange = new Pair<>(archive.collectionMetadata.firstDate, archive.collectionMetadata.lastDate);
        }
        if (daterange.first == null)
            daterange.first = archive.collectionMetadata.firstDate;
        if (daterange.second == null)
            daterange.second = archive.collectionMetadata.lastDate;
        summary.startDate = daterange.first;
        summary.endDate = daterange.second;
        summary_L1_entityCountMap.put(mappedEntity, summary);

        Set<String> altNamesSet = mappedEntity.getAltNames();
        String altNames = (altNamesSet == null) ? "" : "Alternate names: " + Util.join(altNamesSet, ";");
        // dates may still be null if the collection metadata has none; the row then carries a null
        String startText = (summary.startDate != null) ? dateFormat.format(summary.startDate) : null;
        String endText = (summary.endDate != null) ? dateFormat.format(summary.endDate) : null;

        JSONArray j = new JSONArray();
        j.put(0, Util.escapeHTML(mappedEntity.getDisplayName()));
        j.put(1, summary.score);
        j.put(2, summary.messages.size());
        j.put(3, altNames);
        j.put(4, startText);
        j.put(5, endText);
        // add entity type as well..
        j.put(6, NEType.getTypeForCode(entityType).getDisplayName());
        resultArray.put(j);
    }
    summary_JSON = resultArray;
}
Also used : java.util(java.util) BufferedWriter(java.io.BufferedWriter) Util(edu.stanford.muse.util.Util) SimpleDateFormat(java.text.SimpleDateFormat) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Doc(javax.print.Doc) Serializable(java.io.Serializable) Document(edu.stanford.muse.index.Document) NameTypes(edu.stanford.muse.ie.NameTypes) Pair(edu.stanford.muse.util.Pair) Logger(org.apache.logging.log4j.Logger) Archive(edu.stanford.muse.index.Archive) NEType(edu.stanford.muse.ner.model.NEType) EmailDocument(edu.stanford.muse.index.EmailDocument) BufferedReader(java.io.BufferedReader) Entity(edu.stanford.muse.ner.Entity) EmailUtils(edu.stanford.muse.util.EmailUtils) LogManager(org.apache.logging.log4j.LogManager) JSONArray(org.json.JSONArray) EmailDocument(edu.stanford.muse.index.EmailDocument) JSONArray(org.json.JSONArray) SimpleDateFormat(java.text.SimpleDateFormat)

Example 5 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class EntityFeature method checkIndex.

/**
 * Checks for the feature index and creates it if required.
 * <p>
 * When the index has to be built, every email document in the archive is scanned: person, place and
 * organisation names found in the document, plus correspondent names that contain a space, are
 * canonicalized and turned into EntityFeature entries that record co-occurring names and the message's
 * email addresses. Each feature's prior probability is finally normalized by the total number of
 * (document, name) occurrences, and the features are handed to {@code index}.
 *
 * @param archive the archive whose documents are analysed
 * @param force   force creation of the index irrespective of previous existence of the index
 * @return true if successful (or if nothing had to be done); false if the operation was cancelled,
 *         otherwise whatever {@code index} reports
 */
private boolean checkIndex(Archive archive, boolean force) {
    Boolean exists = indexExists(archive);
    // c*: raw names found, g*: names kept after canonicalization, f*: new features created
    int c1 = 0, c2 = 0, c3 = 0;
    int g1 = 0, g2 = 0, g3 = 0;
    int f1 = 0, f2 = 0, f3 = 0;
    boolean istatus = true;
    if (force || (!exists)) {
        Map<String, EntityFeature> features = new HashMap<>();
        Collection<EmailDocument> docs = (Collection) archive.getAllDocs();
        int totalEntities = 0;
        log.info("No feature index found..., starting to process and index. This can take a while.");
        int di = 0;
        for (EmailDocument ed : docs) {
            if (cancel) {
                clean(archive);
                return false;
            }
            if (di % 1000 == 0) {
                JSPHelper.log.info("Done analysing documents: " + di + " of: " + docs.size());
                status = "Analyzed " + di + "/" + docs.size() + " email documents";
                // analysis accounts for the first 50 percent of progress
                pctComplete = ((double) di * 50) / (double) docs.size();
            }
            di++;
            List<Span> names;
            try {
                names = Arrays.asList(archive.getAllNamesInDoc(ed, true));
            } catch (IOException ioe) {
                log.error("Problem accessing entities in " + ed.getUniqueId(), ioe);
                continue;
            }
            List<String> entities = names.stream().filter(n -> n.type == NEType.Type.PERSON.getCode()).map(n -> n.text).collect(Collectors.toList());
            List<String> places = names.stream().filter(n -> n.type == NEType.Type.PLACE.getCode()).map(n -> n.text).collect(Collectors.toList());
            List<String> orgs = names.stream().filter(n -> n.type == NEType.Type.ORGANISATION.getCode()).map(n -> n.text).collect(Collectors.toList());
            // collected lists are never null, so no null checks are needed here
            c1 += entities.size();
            c2 += orgs.size();
            c3 += places.size();

            // canonical name -> kind ("person", "org" or "places") for this document
            Map<String, String> goodNames = new HashMap<>();
            List<String> correspondents = ed.getAllNames();
            List<String> addresses = ed.getAllAddrs();
            if (correspondents != null) {
                for (String c : correspondents) {
                    if (c != null && c.contains(" ")) {
                        String canonicalName = IndexUtils.canonicalizeEntity(c);
                        // skip names that cannot be canonicalized, as the loops below do
                        if (canonicalName == null)
                            continue;
                        goodNames.put(canonicalName, "person");
                    }
                }
            }
            for (String e : entities) {
                if (e != null && e.contains(" ")) {
                    String canonicalEntity = IndexUtils.canonicalizeEntity(e);
                    if (canonicalEntity == null)
                        continue;
                    goodNames.put(canonicalEntity, "person");
                    g1++;
                }
            }
            for (String o : orgs) {
                String canonicalEntity = IndexUtils.canonicalizeEntity(o);
                if (canonicalEntity == null)
                    continue;
                goodNames.put(canonicalEntity, "org");
                g2++;
            }
            for (String p : places) {
                String canonicalEntity = IndexUtils.canonicalizeEntity(p);
                if (canonicalEntity == null)
                    continue;
                goodNames.put(canonicalEntity, "places");
                g3++;
            }

            for (Map.Entry<String, String> good : goodNames.entrySet()) {
                String gn = good.getKey();
                String kind = good.getValue();
                EntityFeature feature = features.get(gn);
                if (feature == null) {
                    // first time this name is seen: create a feature of the matching type
                    if (kind.equals("person")) {
                        feature = new EntityFeature(gn, EntityFeature.PERSON);
                        f1++;
                    } else if (kind.equals("org")) {
                        feature = new EntityFeature(gn, EntityFeature.ORG);
                        f2++;
                    } else {
                        // the only remaining kind stored above is "places"
                        feature = new EntityFeature(gn, EntityFeature.PLACE);
                        f3++;
                    }
                    features.put(gn, feature);
                }
                feature.accountForThis();
                feature.addAllCE(goodNames.keySet());
                if (addresses != null)
                    feature.addAllEA(addresses);
                // raw occurrence count for now; normalized after all documents are processed
                feature.priorProbablity = feature.priorProbablity + 1.0;
                totalEntities++;
            }
        }
        log.info("Found: " + c1 + " entities, " + c2 + " orgs and " + c3 + " places");
        log.info("Gn: " + g1 + " entities, " + g2 + " orgs and " + g3 + " places");
        log.info("Found goodfeatures: " + f1 + " entities, " + f2 + " orgs and " + f3 + " places");
        // totalEntities is non-zero whenever there is at least one feature
        for (EntityFeature feature : features.values())
            feature.priorProbablity = feature.priorProbablity / (double) totalEntities;
        log.info("Done analysing docs. Starting to index.");
        istatus = index(features, archive);
    }
    return istatus;
}
Also used : java.util(java.util) Config(edu.stanford.muse.Config) TypeToken(com.google.gson.reflect.TypeToken) CharArraySet(org.apache.lucene.analysis.CharArraySet) StringField(org.apache.lucene.document.StringField) JSPHelper(edu.stanford.muse.webapp.JSPHelper) ArchiveReaderWriter(edu.stanford.muse.index.ArchiveReaderWriter) StatusProvider(edu.stanford.muse.email.StatusProvider) OpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode) JSONUtils(edu.stanford.muse.util.JSONUtils) Document(org.apache.lucene.document.Document) Gson(com.google.gson.Gson) org.apache.lucene.search(org.apache.lucene.search) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Span(edu.stanford.muse.util.Span) IOException(java.io.IOException) IndexUtils(edu.stanford.muse.index.IndexUtils) Collectors(java.util.stream.Collectors) File(java.io.File) Serializable(java.io.Serializable) org.apache.lucene.index(org.apache.lucene.index) Pair(edu.stanford.muse.util.Pair) Logger(org.apache.logging.log4j.Logger) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Archive(edu.stanford.muse.index.Archive) Field(org.apache.lucene.document.Field) NEType(edu.stanford.muse.ner.model.NEType) TextField(org.apache.lucene.document.TextField) EmailDocument(edu.stanford.muse.index.EmailDocument) StopAnalyzer(org.apache.lucene.analysis.core.StopAnalyzer) LogManager(org.apache.logging.log4j.LogManager) StringEscapeUtils(org.apache.commons.lang.StringEscapeUtils) EmailDocument(edu.stanford.muse.index.EmailDocument) IOException(java.io.IOException) Span(edu.stanford.muse.util.Span)

Aggregations

EmailDocument (edu.stanford.muse.index.EmailDocument)18 Pair (edu.stanford.muse.util.Pair)7 Document (edu.stanford.muse.index.Document)6 Archive (edu.stanford.muse.index.Archive)4 JSONArray (org.json.JSONArray)4 AddressBook (edu.stanford.muse.AddressBookManager.AddressBook)3 NEType (edu.stanford.muse.ner.model.NEType)3 SimpleDateFormat (java.text.SimpleDateFormat)3 java.util (java.util)3 Collectors (java.util.stream.Collectors)3 Address (javax.mail.Address)3 InternetAddress (javax.mail.internet.InternetAddress)3 LogManager (org.apache.logging.log4j.LogManager)3 Logger (org.apache.logging.log4j.Logger)3 Contact (edu.stanford.muse.AddressBookManager.Contact)2 Blob (edu.stanford.muse.datacache.Blob)2 BlobStore (edu.stanford.muse.datacache.BlobStore)2 CancelledException (edu.stanford.muse.exceptions.CancelledException)2 ArchiveReaderWriter (edu.stanford.muse.index.ArchiveReaderWriter)2 EmailUtils (edu.stanford.muse.util.EmailUtils)2