
Example 16 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

In the class EntityBookManager, method recalculateCache.

/*
This method recalculates the cache for the entity book of the given type. If the type is Short.MAX_VALUE, it recalculates all types at once. It was carved out mainly to avoid
recalculating each type's entity book separately (which involves an expensive lucene search for every doc).
 */
private void recalculateCache(Short giventype) {
    log.info("Computing EntityBook Cache");
    long start = System.currentTimeMillis();
    // a subtle issue: if type is Short.MAX_VALUE, we need one docsetmap per type,
    // so create a map of these maps, keyed by type code.
    Map<Short, Map<MappedEntity, Pair<Double, Set<Document>>>> alldocsetmap = new LinkedHashMap<>();
    // now fill this map.
    if (giventype == Short.MAX_VALUE) {
        for (NEType.Type t : NEType.Type.values()) {
            Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
            alldocsetmap.put(t.getCode(), docsetmap);
        }
    } else {
        Map<MappedEntity, Pair<Double, Set<Document>>> docsetmap = new LinkedHashMap<>();
        alldocsetmap.put(giventype, docsetmap);
    }
    // iterate over the lucene docs to recalculate the counts and other summaries of the modified entities,
    // then fill the cache summary in the other fields of each entity book.
    double theta = 0.001;
    long luceneduration1 = 0;
    long luceneduration2 = 0;
    long additionduration = 0;
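    // docEntitiesMap: lucene doc id -> entity mentions (Spans) recognized in that doc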
    Map<String, Span[]> docEntitiesMap = mArchive.getAllEntities(mArchive.getAllDocs().size());
    for (String docid : docEntitiesMap.keySet()) {
        Span[] allspans = docEntitiesMap.get(docid);
        EmailDocument edoc = mArchive.indexer.docForId(docid);
        for (Span span : allspans) {
            // bail out if the span is not of the entity type we're looking for, or its confidence is too low; but don't bail out if we have to do it for all types, i.e. type is Short.MAX_VALUE
            if (giventype != Short.MAX_VALUE && (span.type != giventype || span.typeScore < theta))
                continue;
            // if the given type is Short.MAX_VALUE, use the span's own type; otherwise this is effectively a no-op because type == giventype here.
            Short type = span.type;
            Double score = Double.valueOf(span.typeScore);
            String name = span.getText();
            String canonicalizedname = EntityBook.canonicalize(name);
            // map the canonicalized name to its MappedEntity (which carries the display name)
            MappedEntity mappedEntity = mTypeToEntityBook.get(type).nameToMappedEntity.get(canonicalizedname);
            if (mappedEntity == null) {
                // no mapping means this name was erased from the entity book, so skip it
                continue;
            }
            // add this doc in the docsetmap for the mappedEntity.
            Double oldscore = Double.valueOf(0);
            if (alldocsetmap.get(type).get(mappedEntity) != null)
                oldscore = alldocsetmap.get(type).get(mappedEntity).first;
            Double finalscore = Double.max(oldscore, score);
            Set<Document> docset = new LinkedHashSet<>();
            if (alldocsetmap.get(type).get(mappedEntity) != null)
                docset = alldocsetmap.get(type).get(mappedEntity).second;
            docset.add(edoc);
            // docset.add(doc);
            alldocsetmap.get(type).put(mappedEntity, new Pair<>(finalscore, docset));
        }
    }
    // fill cache summary for ebook in other fields of ebook.
    // beware: if type is Short.MAX_VALUE, this has to be done for every type.
    long end = System.currentTimeMillis();
    log.info("Finished computing entitybook cache in " + (end - start) + " milliseconds");
    if (giventype == Short.MAX_VALUE) {
        for (NEType.Type t : NEType.Type.values()) {
            mTypeToEntityBook.get(t.getCode()).fillSummaryFields(alldocsetmap.get(t.getCode()), mArchive);
        }
    } else
        mTypeToEntityBook.get(giventype).fillSummaryFields(alldocsetmap.get(giventype), mArchive);
    // log.info("Luceneduration 1 = "+luceneduration1+" milliseconds, Luceneduration 2 = "+luceneduration2 + " milliseconds, addition duration = "+additionduration+ " milliseconds");
    // log.info("Finished filling summary of entitybook cache in "+ (System.currentTimeMillis()-end)+" milliseconds");
    log.info("EntityBook Cache computed successfully");
}
Also used : EmailDocument(edu.stanford.muse.index.EmailDocument) Document(edu.stanford.muse.index.Document) Span(edu.stanford.muse.util.Span) NEType(edu.stanford.muse.ner.model.NEType) Pair(edu.stanford.muse.util.Pair)
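
The per-entity bookkeeping above is a simple fold: for every mention, keep the maximum score seen so far and the union of documents the entity occurs in. A minimal, self-contained sketch of that pattern in plain Java (the Mention and MentionAggregator names are illustrative, not part of ePADD):

import java.util.*;

class Mention {
    final String entity;   // canonicalized entity name
    final double score;    // confidence of this mention
    final String docId;    // document the mention was found in
    Mention(String entity, double score, String docId) {
        this.entity = entity;
        this.score = score;
        this.docId = docId;
    }
}

class MentionAggregator {
    // for each entity, keep (max score seen, set of docs it appears in), the same shape
    // as the Pair<Double, Set<Document>> values stored in alldocsetmap above
    static Map<String, Map.Entry<Double, Set<String>>> aggregate(List<Mention> mentions) {
        Map<String, Map.Entry<Double, Set<String>>> result = new LinkedHashMap<>();
        for (Mention m : mentions) {
            Map.Entry<Double, Set<String>> old = result.get(m.entity);
            double best = (old == null) ? m.score : Math.max(old.getKey(), m.score);
            Set<String> docs = (old == null) ? new LinkedHashSet<>() : old.getValue();
            docs.add(m.docId);
            result.put(m.entity, new AbstractMap.SimpleEntry<>(best, docs));
        }
        return result;
    }
}

recalculateCache does the same thing, additionally keyed by entity type and with lucene Documents instead of doc ids.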

Example 17 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

In the class EmailThread, method hasSameRecipients.

/**
 * checks that this thread (represented by its first email) and the other email have exactly the same set of recipient email addresses
 */
private boolean hasSameRecipients(EmailDocument other) {
    if (emails == null || emails.size() == 0)
        return false;
    EmailDocument firstEmail = emails.get(0);
    List<String> x1 = firstEmail.getAllAddrs();
    List<String> x2 = other.getAllAddrs();
    Collections.sort(x1);
    Collections.sort(x2);
    if (x1.size() != x2.size())
        return false;
    for (int i = 0; i < x1.size(); i++)
        if (!x1.get(i).equals(x2.get(i)))
            return false;
    return true;
}
Also used : EmailDocument(edu.stanford.muse.index.EmailDocument)
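
Collections.sort sorts in place, so if getAllAddrs() returns a live internal list the check above mutates it as a side effect. A defensive variant of the same comparison (a sketch with an illustrative name, not ePADD code) copies the lists before sorting:

private boolean haveSameAddrs(List<String> a, List<String> b) {
    if (a == null || b == null || a.size() != b.size())
        return false;
    // copy before sorting so the callers' lists are left untouched
    List<String> x1 = new ArrayList<>(a);
    List<String> x2 = new ArrayList<>(b);
    Collections.sort(x1);
    Collections.sort(x2);
    // equal sorted lists means the same multiset of addresses
    return x1.equals(x2);
}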

Example 18 with EmailDocument

use of edu.stanford.muse.index.EmailDocument in project epadd by ePADD.

In the class MuseEmailFetcher, method fetchAndIndexEmails.

/**
 * key method to fetch actual email messages. can take a long time.
 * @param selectedFolders entries are in the format <account name>^-^<folder name>
 * @param session is used only to put a status provider object in; callers who do not need to track status can leave it as null
 * Fills in emailDocs, addressBook and blobstore.
 * @throws NoDefaultFolderException
 */
public void fetchAndIndexEmails(Archive archive, String[] selectedFolders, boolean useDefaultFolders, FetchConfig fetchConfig, HttpSession session, Consumer<StatusProvider> setStatusProvider) throws InterruptedException, JSONException, NoDefaultFolderException, CancelledException {
    setupFetchers(-1);
    long startTime = System.currentTimeMillis();
    setStatusProvider.accept(new StaticStatusProvider("Starting to process messages..."));
    // if (session != null)
    // session.setAttribute("statusProvider", new StaticStatusProvider("Starting to process messages..."));
    boolean op_cancelled = false, out_of_mem = false;
    BlobStore attachmentsStore = archive.getBlobStore();
    fetchConfig.downloadAttachments = fetchConfig.downloadAttachments && attachmentsStore != null;
    if (Util.nullOrEmpty(fetchers)) {
        log.warn("Trying to fetch email with no fetchers, setup not called ?");
        return;
    }
    setupFoldersForFetchers(fetchers, selectedFolders, useDefaultFolders);
    List<FolderInfo> fetchedFolderInfos = new ArrayList<>();
    // one fetcher will aggregate everything
    FetchStats stats = new FetchStats();
    MTEmailFetcher aggregatingFetcher = null;
    // a fetcher is one source, like an account or a top-level mbox dir. A fetcher could include multiple folders.
    long startTimeMillis = System.currentTimeMillis();
    for (MTEmailFetcher fetcher : fetchers) {
        // in theory, different iterations of this loop could be run in parallel ("archive" access will be synchronized)
        setStatusProvider.accept(fetcher);
        // if (session != null)
        //     session.setAttribute("statusProvider", fetcher);
        fetcher.setArchive(archive);
        fetcher.setFetchConfig(fetchConfig);
        log.info("Memory status before fetching emails: " + Util.getMemoryStats());
        // this is the big call and can run for a long time. Note: it runs in the same thread; it's fetcher.run(), not fetcher.start()
        List<FolderInfo> foldersFetchedByThisFetcher = fetcher.run();
        // but don't abort immediately, only at the end, after addressbook has been built for at least the processed messages
        if (fetcher.isCancelled()) {
            log.info("NOTE: fetcher operation was cancelled");
            op_cancelled = true;
            break;
        }
        if (fetcher.mayHaveRunOutOfMemory()) {
            log.warn("Fetcher operation ran out of memory " + fetcher);
            out_of_mem = true;
            break;
        }
        fetchedFolderInfos.addAll(foldersFetchedByThisFetcher);
        if (aggregatingFetcher == null && !Util.nullOrEmpty(foldersFetchedByThisFetcher))
            // first non-empty fetcher
            aggregatingFetcher = fetcher;
        if (aggregatingFetcher != null)
            aggregatingFetcher.merge(fetcher);
        // add the indexed folders to the stats
        EmailStore store = fetcher.getStore();
        String fetcherDescription = store.displayName + ":" + store.emailAddress;
        for (FolderInfo fi : fetchedFolderInfos) stats.selectedFolders.add(new Pair<>(fetcherDescription, fi));
    }
    if (op_cancelled)
        throw new CancelledException();
    if (out_of_mem)
        throw new OutOfMemoryError();
    if (aggregatingFetcher != null) {
        stats.importStats = aggregatingFetcher.stats;
        if (aggregatingFetcher.mayHaveRunOutOfMemory())
            throw new OutOfMemoryError();
    }
    // save memory
    aggregatingFetcher = null;
    long endTimeMillis = System.currentTimeMillis();
    long elapsedMillis = endTimeMillis - startTimeMillis;
    log.info(elapsedMillis + " ms for fetch+index, Memory status: " + Util.getMemoryStats());
    // note: this is all archive docs, not just the ones that may have been just imported
    List<EmailDocument> allEmailDocs = (List) archive.getAllDocs();
    archive.addFetchedFolderInfos(fetchedFolderInfos);
    if (allEmailDocs.size() == 0)
        log.warn("0 messages from email fetcher");
    // EmailUtils.cleanDates(allEmailDocs);
    // create a new address book
    // if (session != null)
    // session.setAttribute("statusProvider", new StaticStatusProvider("Building address book..."));
    setStatusProvider.accept(new StaticStatusProvider("Building address book..."));
    AddressBook addressBook = EmailDocument.buildAddressBook(allEmailDocs, archive.ownerEmailAddrs, archive.ownerNames);
    log.info("Address book created!!");
    log.info("Address book stats: " + addressBook.getStats());
    // if (session != null)
    // session.setAttribute("statusProvider", new StaticStatusProvider("Finishing up..."));
    setStatusProvider.accept(new StaticStatusProvider("Finishing up..."));
    archive.setAddressBook(addressBook);
    // we shouldn't really have dups now because the archive ensures that only unique docs are added
    // move sorting to archive.postprocess?
    EmailUtils.removeDupsAndSort(allEmailDocs);
    // report stats
    stats.lastUpdate = new Date().getTime();
    // For issue #254.
    stats.archiveOwnerInput = name;
    stats.archiveTitleInput = archiveTitle;
    stats.primaryEmailInput = alternateEmailAddrs;
    stats.emailSourcesInput = emailSources;
    // ////
    // (String) JSPHelper.getSessionAttribute(session, "userKey");
    stats.userKey = "USER KEY UNUSED";
    stats.fetchAndIndexTimeMillis = elapsedMillis;
    updateStats(archive, addressBook, stats);
    // if (session != null)
    // session.removeAttribute("statusProvider");
    log.info("Fetch+index complete: " + Util.commatize(System.currentTimeMillis() - startTime) + " ms");
}
Also used : CancelledException(edu.stanford.muse.exceptions.CancelledException) EmailDocument(edu.stanford.muse.index.EmailDocument) AddressBook(edu.stanford.muse.AddressBookManager.AddressBook) BlobStore(edu.stanford.muse.datacache.BlobStore) Pair(edu.stanford.muse.util.Pair)
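
Progress is reported through the setStatusProvider callback rather than by writing to the HttpSession (those writes are commented out above), so callers decide where status goes. A minimal sink that just remembers the latest StatusProvider for a polling endpoint to read could look like the sketch below; apart from the Consumer<StatusProvider> shape taken from the signature above, every name here is an assumption:

import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
// assumed package for the StatusProvider interface used in the signature above
import edu.stanford.muse.util.StatusProvider;

// keep only the most recent StatusProvider; a status endpoint can poll currentStatus.get()
AtomicReference<StatusProvider> currentStatus = new AtomicReference<>();
Consumer<StatusProvider> setStatusProvider = currentStatus::set;
// hypothetical call site, assuming fetcher, archive, folders and fetchConfig are already set up:
// fetcher.fetchAndIndexEmails(archive, selectedFolders, useDefaultFolders, fetchConfig, null, setStatusProvider);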

Aggregations

EmailDocument (edu.stanford.muse.index.EmailDocument): 18 usages
Pair (edu.stanford.muse.util.Pair): 7 usages
Document (edu.stanford.muse.index.Document): 6 usages
Archive (edu.stanford.muse.index.Archive): 4 usages
JSONArray (org.json.JSONArray): 4 usages
AddressBook (edu.stanford.muse.AddressBookManager.AddressBook): 3 usages
NEType (edu.stanford.muse.ner.model.NEType): 3 usages
SimpleDateFormat (java.text.SimpleDateFormat): 3 usages
java.util (java.util): 3 usages
Collectors (java.util.stream.Collectors): 3 usages
Address (javax.mail.Address): 3 usages
InternetAddress (javax.mail.internet.InternetAddress): 3 usages
LogManager (org.apache.logging.log4j.LogManager): 3 usages
Logger (org.apache.logging.log4j.Logger): 3 usages
Contact (edu.stanford.muse.AddressBookManager.Contact): 2 usages
Blob (edu.stanford.muse.datacache.Blob): 2 usages
BlobStore (edu.stanford.muse.datacache.BlobStore): 2 usages
CancelledException (edu.stanford.muse.exceptions.CancelledException): 2 usages
ArchiveReaderWriter (edu.stanford.muse.index.ArchiveReaderWriter): 2 usages
EmailUtils (edu.stanford.muse.util.EmailUtils): 2 usages