Search in sources :

Example 11 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class AddressBook method getOutMessageCount.

/**
 * Counts how many of the messages in the given collection are outgoing, i.e. have a
 * from-address that is one of the email addresses of the contact returned by
 * getContactForSelf().
 *
 * @param docs the messages to examine
 * @return the number of messages whose from-address belongs to the self contact;
 *         0 if there is no self contact or it has no email addresses
 */
public int getOutMessageCount(Collection<EmailDocument> docs) {
    Contact me = getContactForSelf();
    if (me == null)
        return 0;
    // the self contact's addresses are the same for every message, so fetch them once
    Set<String> selfAddrs = me.getEmails();
    // other code in this project treats getEmails() as possibly null, so do the same here
    if (selfAddrs == null)
        return 0;
    int count = 0;
    for (EmailDocument ed : docs) {
        if (selfAddrs.contains(ed.getFromEmailAddress()))
            count++;
    }
    return count;
}
Also used : EmailDocument(edu.stanford.muse.index.EmailDocument)

Example 12 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class SimpleSessions method saveArchive.

/**
 * Saves the archive in the current session to the sessions sub-directory of baseDir.
 * Note: no blobs are saved.
 *
 * Steps, in order:
 * 1. verifies the archive,
 * 2. refreshes archive.collectionMetadata (timestamp, time zone, document/blob counts,
 *    sent/received counts, first/last message date),
 * 3. serializes the archive object into a gzip-compressed session file,
 * 4. saves the address book, entity book, correspondent authority mapper, label manager
 *    and annotations through their dedicated SimpleSessions save methods, and writes the
 *    collection metadata,
 * 5. closes the archive and re-opens it for reading.
 *
 * A failure while serializing the archive object is logged, and the remaining steps are
 * still attempted (best effort), but the failure is reported through the return value.
 *
 * @param baseDir directory under which the sessions sub-directory lives
 * @param name    base name of the session file (the session suffix is appended)
 * @param archive the archive to save; its collectionMetadata is created if absent and updated
 * @return true if the archive object was written to the session file, false if writing it failed
 * @throws IOException declared for callers; may be raised by the steps outside the guarded
 *                     serialization block
 */
public static boolean saveArchive(String baseDir, String name, Archive archive) throws IOException {
    log.info("Before saving the archive checking if it is still in good shape");
    archive.Verify();
    String dir = baseDir + File.separatorChar + Archive.SESSIONS_SUBDIR;
    // just to be safe
    new File(dir).mkdirs();
    String filename = dir + File.separatorChar + name + SimpleSessions.SESSION_SUFFIX;
    log.info("Saving archive to (session) file " + filename);

    if (archive.collectionMetadata == null)
        archive.collectionMetadata = new Archive.CollectionMetadata();
    archive.collectionMetadata.timestamp = System.currentTimeMillis();
    archive.collectionMetadata.tz = TimeZone.getDefault().getID();
    archive.collectionMetadata.nDocs = archive.getAllDocs().size();

    int totalAttachments = 0, images = 0, docs = 0, others = 0, sentMessages = 0, receivedMessages = 0, hackyDates = 0;
    Date firstDate = null, lastDate = null;
    for (Document d : archive.getAllDocs()) {
        if (!(d instanceof EmailDocument))
            continue;
        EmailDocument ed = (EmailDocument) d;

        // messages flagged with a hacky date are only counted; they do not affect the date range
        if (ed.date != null) {
            if (ed.hackyDate) {
                hackyDates++;
            } else {
                if (firstDate == null || ed.date.before(firstDate))
                    firstDate = ed.date;
                if (lastDate == null || ed.date.after(lastDate))
                    lastDate = ed.date;
            }
        }

        // the two masks are tested independently, so one message can count as both sent and received
        int sentOrReceived = ed.sentOrReceived(archive.addressBook);
        if ((sentOrReceived & EmailDocument.SENT_MASK) != 0)
            sentMessages++;
        if ((sentOrReceived & EmailDocument.RECEIVED_MASK) != 0)
            receivedMessages++;

        if (!Util.nullOrEmpty(ed.attachments)) {
            totalAttachments += ed.attachments.size();
            for (Blob b : ed.attachments) {
                // attachments without a filename count towards the total but not towards any category
                if (Util.nullOrEmpty(b.filename))
                    continue;
                if (Util.is_image_filename(b.filename))
                    images++;
                else if (Util.is_doc_filename(b.filename))
                    docs++;
                else
                    others++;
            }
        }
    }

    archive.collectionMetadata.firstDate = firstDate;
    archive.collectionMetadata.lastDate = lastDate;
    archive.collectionMetadata.nIncomingMessages = receivedMessages;
    archive.collectionMetadata.nOutgoingMessages = sentMessages;
    archive.collectionMetadata.nHackyDates = hackyDates;
    archive.collectionMetadata.nBlobs = totalAttachments;
    archive.collectionMetadata.nUniqueBlobs = archive.blobStore.uniqueBlobs.size();
    archive.collectionMetadata.nImageBlobs = images;
    archive.collectionMetadata.nDocBlobs = docs;
    archive.collectionMetadata.nOtherBlobs = others;

    // remember whether the main serialization worked so the caller is not told "true" after a failure
    boolean archiveWritten = true;
    try (ObjectOutputStream oos = new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(filename)))) {
        oos.writeObject("archive");
        oos.writeObject(archive);
    } catch (Exception e1) {
        Util.print_exception("Failed to write archive: ", e1, log);
        archiveWritten = false;
    }

    // Now write the modular parts of the archive through their own save methods.
    // AddressBook -- in human readable form
    SimpleSessions.saveAddressBook(archive);
    // EntityBook -- in human readable form
    SimpleSessions.saveEntityBook(archive);
    // CorrespondentAuthorityMapper -- serialized
    SimpleSessions.saveCorrespondentAuthorityMapper(archive);
    // LabelManager -- serialized
    SimpleSessions.saveLabelManager(archive);
    // Annotations -- in human readable form
    SimpleSessions.saveAnnotations(archive);
    writeCollectionMetadata(archive.collectionMetadata, baseDir);

    archive.close();
    // re-open for reading
    archive.openForRead();
    return archiveWritten;
}
Also used : CollectionMetadata(edu.stanford.muse.index.Archive.CollectionMetadata) Blob(edu.stanford.muse.datacache.Blob) GZIPOutputStream(java.util.zip.GZIPOutputStream) EmailDocument(edu.stanford.muse.index.EmailDocument) Document(edu.stanford.muse.index.Document) EmailDocument(edu.stanford.muse.index.EmailDocument) ParseException(org.apache.lucene.queryparser.classic.ParseException) LockObtainFailedException(org.apache.lucene.store.LockObtainFailedException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException)

Example 13 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class AddressBook method sortedContacts.

/**
 * Returns a list of all contacts that can be resolved from the addresses on the given docs.
 *
 * For every address on a message, the matching contact gets +1 "out" count if the address equals
 * the message's from-address, and +1 "in" count otherwise. The result lists contacts in the order
 * produced by Util.sortMapByValue on the in counts, followed by any contacts that only have an
 * out count, in the order produced by Util.sortMapByValue on the out counts.
 *
 * Note that when processing sent email, the in count is the number of messages sent by the owner
 * of the archive to the person.
 *
 * @param docs the messages whose participants should be collected
 * @return the contacts found, without duplicates, in the order described above
 */
public List<Contact> sortedContacts(Collection<EmailDocument> docs) {
    Map<Contact, Integer> contactInCount = new LinkedHashMap<>(), contactOutCount = new LinkedHashMap<>();
    // an address is tallied once per occurrence, so a person who sends a message to himself
    // can be counted more than once for that message
    for (EmailDocument ed : docs) {
        String senderEmail = ed.getFromEmailAddress();
        for (String email : ed.getAllAddrs()) {
            Contact c = lookupByEmail(email);
            if (c == null)
                continue;
            // a message without a from-address must not abort the whole pass;
            // in that case none of its addresses can be the sender, so they count as "in"
            boolean isSender = senderEmail != null && senderEmail.equals(email);
            Map<Contact, Integer> counts = isSender ? contactOutCount : contactInCount;
            counts.merge(c, 1, Integer::sum);
        }
    }

    // the LinkedHashSet keeps first-insertion order and drops repeats, so contacts present
    // in both maps keep the position given by their in count
    Set<Contact> sortedContactsSet = new LinkedHashSet<>();
    for (Pair<Contact, Integer> p : Util.sortMapByValue(contactInCount))
        sortedContactsSet.add(p.getFirst());
    // then by out count.
    for (Pair<Contact, Integer> p : Util.sortMapByValue(contactOutCount))
        sortedContactsSet.add(p.getFirst());

    for (Contact c : sortedContactsSet) {
        if (getContactId(c) < 0)
            Util.warnIf(true, "Contact has -ve contact id: " + c, log);
    }
    return new ArrayList<>(sortedContactsSet);
}
Also used : EmailDocument(edu.stanford.muse.index.EmailDocument) Pair(edu.stanford.muse.util.Pair)

Example 14 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class AddressBook method fillL1_SummaryObject.

/**
 * Rebuilds the three L1 summary maps of this address book from the given documents:
 * - L1_Summary_SentDocs: contact -> messages whose from-address resolves to that contact,
 * - L1_Summary_ReceivedDocs: contact -> messages that have the contact on to/cc/bcc,
 * - L1_Summary_RecFrmOwnerDocs: contact -> messages flagged as sent (SENT_MASK) that have
 *   the contact on to/cc/bcc.
 *
 * All three maps are cleared first. Documents that are not EmailDocuments, and messages whose
 * sender cannot be resolved to a contact, are skipped. (The mention semantics were removed in v7.)
 *
 * @param alldocs the documents to summarize
 */
public void fillL1_SummaryObject(Collection<Document> alldocs) {
    // clear the summary objects.
    L1_Summary_SentDocs.clear();
    L1_Summary_ReceivedDocs.clear();
    L1_Summary_RecFrmOwnerDocs.clear();

    // compute counts
    for (Document d : alldocs) {
        // only email messages take part in the summary
        if (!(d instanceof EmailDocument))
            continue;
        EmailDocument ed = (EmailDocument) d;

        Contact senderContact = lookupByEmail(ed.getFromEmailAddress());
        // Do not fall back to the owner's contact when the sender is unknown: that would
        // wrongly report the message as sent by the owner.
        if (senderContact == null)
            continue;

        // this contact is the sender of the message
        L1_Summary_SentDocs.computeIfAbsent(senderContact, k -> new LinkedHashSet<>()).add(ed);

        // every to/cc/bcc contact received this message
        Collection<Contact> toContacts = ed.getToCCBCCContacts(this);
        for (Contact c : toContacts)
            L1_Summary_ReceivedDocs.computeIfAbsent(c, k -> new LinkedHashSet<>()).add(ed);

        // a message could be both sent and received; only the sent flag matters here.
        // For a sent message each to/cc/bcc contact (possibly the owner too) received it from the owner.
        if ((ed.sentOrReceived(this) & EmailDocument.SENT_MASK) != 0) {
            for (Contact c : toContacts)
                L1_Summary_RecFrmOwnerDocs.computeIfAbsent(c, k -> new LinkedHashSet<>()).add(ed);
        }
    }
}
Also used : Address(javax.mail.Address) java.util(java.util) Util(edu.stanford.muse.util.Util) SimpleDateFormat(java.text.SimpleDateFormat) ArchiveReaderWriter(edu.stanford.muse.index.ArchiveReaderWriter) Multimap(com.google.common.collect.Multimap) Collectors(java.util.stream.Collectors) SetMultimap(com.google.common.collect.SetMultimap) Document(edu.stanford.muse.index.Document) InternetAddress(javax.mail.internet.InternetAddress) LabelManager(edu.stanford.muse.LabelManager.LabelManager) Pair(edu.stanford.muse.util.Pair) Logger(org.apache.logging.log4j.Logger) java.io(java.io) Archive(edu.stanford.muse.index.Archive) EmailDocument(edu.stanford.muse.index.EmailDocument) EmailUtils(edu.stanford.muse.util.EmailUtils) LogManager(org.apache.logging.log4j.LogManager) LinkedHashMultimap(com.google.common.collect.LinkedHashMultimap) JSONArray(org.json.JSONArray) DictUtils(edu.stanford.muse.util.DictUtils) EmailDocument(edu.stanford.muse.index.EmailDocument)

Example 15 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

the class NameExpansion method getMatches.

/**
 * Given the string s in emailDocument ed, returns a matches object with candidates matching s.
 *
 * Candidates are searched in widening scopes, with a decreasing score per scope:
 * 1. names of the contacts on this message, plus the self contact if there is one (1.0),
 * 2. the content of this message (1.0),
 * 3. the content of the other messages in the same thread (0.9),
 * 4. the content of other messages with the same correspondents (0.8),
 * 5. the content of any message in the archive whose subject or body contains s (0.7).
 * The search returns as soon as addMatch or matchAgainstEmailContent reports true
 * (presumably because maxResults has been reached -- confirm against Matches).
 *
 * @param s          the string to expand
 * @param archive    the archive to search in
 * @param ed         the message in which s occurs
 * @param maxResults passed to the Matches constructor
 * @return the matches object holding the candidates found
 */
public static Matches getMatches(String s, Archive archive, EmailDocument ed, int maxResults) {
    Matches matches = new Matches(s, maxResults);
    AddressBook ab = archive.addressBook;
    List<Contact> contactsExceptSelf = ed.getParticipatingContactsExceptOwn(ab);
    List<Contact> contacts = new ArrayList<>(contactsExceptSelf);
    // the address book may have no self contact; adding null would break the loop below
    Contact self = ab.getContactForSelf();
    if (self != null)
        contacts.add(self);

    // check if s matches any contacts on this message
    for (Contact c : contacts) {
        if (c.getNames() == null)
            continue;
        for (String name : c.getNames()) {
            StringMatchType matchType = Matches.match(s, name);
            if (matchType == null)
                continue;
            if (matches.addMatch(name, 1.0F, matchType, "Name of a contact on this message", true))
                return matches;
            // at most one name is recorded per contact; move on to the next contact
            break;
        }
    }

    // check if s matches anywhere else in this message
    if (matchAgainstEmailContent(archive, ed, matches, "Mentioned elsewhere in this message", 1.0F)) {
        return matches;
    }

    // a thread id of 0 is taken to mean "not assigned yet"; assign ids under the archive lock
    synchronized (archive) {
        if (ed.threadID == 0L) {
            archive.assignThreadIds();
        }
    }

    // check if s matches anywhere else in this thread
    List<EmailDocument> messagesInThread = (List) archive.docsWithThreadId(ed.threadID);
    for (EmailDocument messageInThread : messagesInThread) {
        if (matchAgainstEmailContent(archive, messageInThread, matches, "Mentioned in this thread", 0.9F)) {
            return matches;
        }
    }

    // check if s matches any other email with any of these correspondents
    for (Contact c : contactsExceptSelf) {
        if (c.getEmails() == null)
            continue;
        String correspondentsSearchStr = String.join(";", c.getEmails());
        // filterForCorrespondents does not use the query params, so it is fine to create the
        // SearchResult with null params. Filter methods take a SearchResult as input and
        // modify it according to the filter.
        SearchResult inputSet = new SearchResult(archive, null);
        SearchResult outputSet = SearchResult.filterForCorrespondents(inputSet, correspondentsSearchStr, true, true, true, true);
        Set<Document> messagesWithSameCorrespondents = outputSet.getDocumentSet();
        for (Document messageWithSameCorrespondents : messagesWithSameCorrespondents) {
            EmailDocument edoc = (EmailDocument) messageWithSameCorrespondents;
            if (matchAgainstEmailContent(archive, edoc, matches, "Mentioned in other messages with these correspondents", 0.8F)) {
                return matches;
            }
        }
    }

    // search for s anywhere in the archive
    Multimap<String, String> params = LinkedHashMultimap.create();
    params.put("termSubject", "on");
    params.put("termBody", "on");
    // a multi-word string is searched as a phrase: wrap it in quotes unless it already is quoted
    String term = s;
    if (s.contains(" ") && (!s.startsWith("\"") || !s.endsWith("\""))) {
        term = "\"" + s + "\"";
    }
    // To search for terms, create a SearchResult object and invoke the appropriate filter method on it.
    SearchResult termInput = new SearchResult(archive, params);
    SearchResult termOutput = SearchResult.searchForTerm(termInput, term);
    Set<Document> docsWithTerm = termOutput.getDocumentSet();
    for (Document docWithTerm : docsWithTerm) {
        EmailDocument edoc = (EmailDocument) docWithTerm;
        if (matchAgainstEmailContent(archive, edoc, matches, "Mentioned elsewhere in this archive", 0.7F))
            return matches;
    }
    return matches;
}
Also used : EmailDocument(edu.stanford.muse.index.EmailDocument) SearchResult(edu.stanford.muse.index.SearchResult) Document(edu.stanford.muse.index.Document) EmailDocument(edu.stanford.muse.index.EmailDocument) Contact(edu.stanford.muse.AddressBookManager.Contact) AddressBook(edu.stanford.muse.AddressBookManager.AddressBook)

Aggregations

EmailDocument (edu.stanford.muse.index.EmailDocument)18 Pair (edu.stanford.muse.util.Pair)7 Document (edu.stanford.muse.index.Document)6 Archive (edu.stanford.muse.index.Archive)4 JSONArray (org.json.JSONArray)4 AddressBook (edu.stanford.muse.AddressBookManager.AddressBook)3 NEType (edu.stanford.muse.ner.model.NEType)3 SimpleDateFormat (java.text.SimpleDateFormat)3 java.util (java.util)3 Collectors (java.util.stream.Collectors)3 Address (javax.mail.Address)3 InternetAddress (javax.mail.internet.InternetAddress)3 LogManager (org.apache.logging.log4j.LogManager)3 Logger (org.apache.logging.log4j.Logger)3 Contact (edu.stanford.muse.AddressBookManager.Contact)2 Blob (edu.stanford.muse.datacache.Blob)2 BlobStore (edu.stanford.muse.datacache.BlobStore)2 CancelledException (edu.stanford.muse.exceptions.CancelledException)2 ArchiveReaderWriter (edu.stanford.muse.index.ArchiveReaderWriter)2 EmailUtils (edu.stanford.muse.util.EmailUtils)2