Search in sources :

Example 1 with BlobStore

use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.

the class MuseEmailFetcher method fetchAndIndexEmails.

/**
 * Key method to fetch actual email messages. Can take a long time.
 * Runs the configured fetchers one after another in the calling thread, then rebuilds the
 * address book over all docs in the archive and updates the fetch stats.
 *
 * @param archive archive the fetchers are attached to; its blob store, docs and owner info are read here
 * @param selectedFolders is in the format &lt;account name&gt;^-^&lt;folder name&gt;
 * @param useDefaultFolders passed through to setupFoldersForFetchers
 * @param fetchConfig fetch options; downloadAttachments is switched off here if the archive has no blob store
 * @param session is used only to set the status provider object. callers who do not need to track status can leave it as null
 * @throws CancelledException if a fetcher reports that it was cancelled
 * @throws OutOfMemoryError if a fetcher reports that it may have run out of memory
 * @throws NoDefaultFolderException
 */
public void fetchAndIndexEmails(Archive archive, String[] selectedFolders, boolean useDefaultFolders, FetchConfig fetchConfig, HttpSession session) throws MessagingException, InterruptedException, IOException, JSONException, NoDefaultFolderException, CancelledException {
    setupFetchers(-1);
    long startTime = System.currentTimeMillis();
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Starting to process messages..."));
    boolean op_cancelled = false, out_of_mem = false;
    BlobStore attachmentsStore = archive.getBlobStore();
    // attachments can only be downloaded if there is somewhere to put them
    fetchConfig.downloadAttachments = fetchConfig.downloadAttachments && attachmentsStore != null;
    if (Util.nullOrEmpty(fetchers)) {
        log.warn("Trying to fetch email with no fetchers, setup not called ?");
        return;
    }
    setupFoldersForFetchers(fetchers, selectedFolders, useDefaultFolders);
    List<FolderInfo> fetchedFolderInfos = new ArrayList<>();
    // one fetcher will aggregate everything
    FetchStats stats = new FetchStats();
    MTEmailFetcher aggregatingFetcher = null;
    // a fetcher is one source, like an account or a top-level mbox dir. A fetcher could include multiple folders.
    long startTimeMillis = System.currentTimeMillis();
    for (MTEmailFetcher fetcher : fetchers) {
        if (session != null)
            session.setAttribute("statusProvider", fetcher);
        fetcher.setArchive(archive);
        fetcher.setFetchConfig(fetchConfig);
        log.info("Memory status before fetching emails: " + Util.getMemoryStats());
        // this is the big call, can run for a long time. Note: running in the same thread, its not fetcher.start();
        List<FolderInfo> foldersFetchedByThisFetcher = fetcher.run();
        // on cancel / out-of-memory, stop running further fetchers; the exception itself is thrown after the loop
        if (fetcher.isCancelled()) {
            log.info("NOTE: fetcher operation was cancelled");
            op_cancelled = true;
            break;
        }
        if (fetcher.mayHaveRunOutOfMemory()) {
            log.warn("Fetcher operation ran out of memory " + fetcher);
            out_of_mem = true;
            break;
        }
        fetchedFolderInfos.addAll(foldersFetchedByThisFetcher);
        if (aggregatingFetcher == null && !Util.nullOrEmpty(foldersFetchedByThisFetcher))
            // first non-empty fetcher
            aggregatingFetcher = fetcher;
        // NOTE(review): for the first non-empty fetcher this merges the fetcher into itself -- confirm merge() tolerates that
        if (aggregatingFetcher != null)
            aggregatingFetcher.merge(fetcher);
        // add the folders indexed by THIS fetcher to the stats. (Iterating the accumulated
        // fetchedFolderInfos here would re-add the folders of earlier fetchers under this fetcher's description.)
        EmailStore store = fetcher.getStore();
        String fetcherDescription = store.displayName + ":" + store.emailAddress;
        for (FolderInfo fi : foldersFetchedByThisFetcher)
            stats.selectedFolders.add(new Pair<>(fetcherDescription, fi));
    }
    if (op_cancelled)
        throw new CancelledException();
    if (out_of_mem)
        throw new OutOfMemoryError();
    if (aggregatingFetcher != null) {
        stats.importStats = aggregatingFetcher.stats;
        if (aggregatingFetcher.mayHaveRunOutOfMemory())
            throw new OutOfMemoryError();
    }
    // save memory
    aggregatingFetcher = null;
    long endTimeMillis = System.currentTimeMillis();
    long elapsedMillis = endTimeMillis - startTimeMillis;
    log.info(elapsedMillis + " ms for fetch+index, Memory status: " + Util.getMemoryStats());
    // note: this is all archive docs, not just the ones that may have been just imported
    List<EmailDocument> allEmailDocs = (List) archive.getAllDocs();
    archive.addFetchedFolderInfos(fetchedFolderInfos);
    if (allEmailDocs.size() == 0)
        log.warn("0 messages from email fetcher");
    EmailUtils.cleanDates(allEmailDocs);
    // create a new address book
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Building address book..."));
    AddressBook addressBook = EmailDocument.buildAddressBook(allEmailDocs, archive.ownerEmailAddrs, archive.ownerNames);
    log.info("Address book stats: " + addressBook.getStats());
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Finishing up..."));
    archive.setAddressBook(addressBook);
    // we shouldn't really have dups now because the archive ensures that only unique docs are added
    // move sorting to archive.postprocess?
    EmailUtils.removeDupsAndSort(allEmailDocs);
    // report stats
    stats.lastUpdate = new Date().getTime();
    // (String) JSPHelper.getSessionAttribute(session, "userKey");
    stats.userKey = "USER KEY UNUSED";
    stats.fetchAndIndexTimeMillis = elapsedMillis;
    updateStats(archive, addressBook, stats);
    if (session != null)
        session.removeAttribute("statusProvider");
    log.info("Fetch+index complete: " + Util.commatize(System.currentTimeMillis() - startTime) + " ms");
}
Also used : CancelledException(edu.stanford.muse.exceptions.CancelledException) EmailDocument(edu.stanford.muse.index.EmailDocument) AddressBook(edu.stanford.muse.AddressBookManager.AddressBook) BlobStore(edu.stanford.muse.datacache.BlobStore) Pair(edu.stanford.muse.util.Pair)

Example 2 with BlobStore

use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.

the class SearchResult method filterForAttachmentEntities.

/**
 ******************************ATTACHMENT SPECIFIC FILTERS************************************
 */
/**
 * Returns only those docs that have at least one attachment whose blob-store keywords
 * intersect params[attachmentEntity] (this field is or-delimiter separated; it is lower-cased
 * before splitting). If the parameter is absent or empty, inputSet is returned unchanged.
 * For every retained doc, the matching blobs are added to that doc's AttachmentHLInfo.
 * Todo: review usage of this and BlobStore.getKeywordsForBlob()
 *
 * @param inputSet the search result to filter; its queryParams, archive and matchedDocs are read
 * @return inputSet itself when there is nothing to filter on, otherwise a new SearchResult holding only the matching docs
 */
private static SearchResult filterForAttachmentEntities(SearchResult inputSet) {
    String val = JSPHelper.getParam(inputSet.queryParams, "attachmentEntity");
    if (Util.nullOrEmpty(val))
        return inputSet;
    Set<String> entities = Util.splitFieldForOr(val.toLowerCase());
    BlobStore blobStore = inputSet.archive.blobStore;
    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> outputDocs = new HashMap<>();
    for (Document doc : inputSet.matchedDocs.keySet()) {
        EmailDocument ed = (EmailDocument) doc;
        Collection<Blob> blobs = ed.attachments;
        // a message without attachments cannot match an attachment filter
        if (blobs == null)
            continue;
        // collect every attachment of this message that carries one of the requested entities
        Set<Blob> matchedBlobs = new HashSet<>();
        for (Blob blob : blobs) {
            Collection<String> keywords = blobStore.getKeywordsForBlob(blob);
            // read-only membership test: retainAll() here would modify the collection handed out
            // by the blob store, which may be the store's own copy
            if (keywords != null && keywords.stream().anyMatch(entities::contains))
                matchedBlobs.add(blob);
        }
        // keep the document only if at least one of its attachments matched
        if (!matchedBlobs.isEmpty()) {
            BodyHLInfo bhlinfo = inputSet.matchedDocs.get(doc).first;
            AttachmentHLInfo attachmentHLInfo = inputSet.matchedDocs.get(doc).second;
            attachmentHLInfo.addMultipleInfo(matchedBlobs);
            outputDocs.put(doc, new Pair<>(bhlinfo, attachmentHLInfo));
        }
    }
    return new SearchResult(outputDocs, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
Also used : Blob(edu.stanford.muse.datacache.Blob) BlobStore(edu.stanford.muse.datacache.BlobStore) Pair(edu.stanford.muse.util.Pair)

Example 3 with BlobStore

use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.

the class EmailRenderer method htmlForDocument.

/**
 * Returns the HTML for a document, as shown in the message browsing screen.
 * For an EmailDocument this is the header, the (possibly highlighted) body and, if present,
 * the attachments section. In public mode only the number of attachments is shown.
 *
 * @param d the document to render
 * @param searchResult supplies the archive and the highlighting information for d
 * @param datasetTitle not used by this method
 * @param authorisedEntities passed through to archive.getHTMLForContents
 * @param IA_links passed through to the header and contents renderers
 * @param inFull passed through to archive.getHTMLForContents
 * @param debug passed through to the header renderer
 * @param archiveID used to build the serveAttachment.jsp URLs
 * @return pair of (html, overflow); overflow is the flag reported by archive.getHTMLForContents and stays false for non-email documents
 * @throws Exception
 */
// TODO: inFull, debug params can be removed
// TODO: Consider a HighlighterOptions class
public static Pair<String, Boolean> htmlForDocument(Document d, SearchResult searchResult, String datasetTitle, Map<String, Map<String, Short>> authorisedEntities, boolean IA_links, boolean inFull, boolean debug, String archiveID) throws Exception {
    JSPHelper.log.debug("Generating HTML for document: " + d);
    Archive archive = searchResult.getArchive();
    String html;
    boolean overflow = false;
    if (d instanceof EmailDocument) {
        // for email docs, 1 doc = 1 page
        EmailDocument ed = (EmailDocument) d;
        List<Blob> highlightAttachments = searchResult.getAttachmentHighlightInformation(d);
        StringBuilder page = new StringBuilder();
        page.append("<div class=\"muse-doc\">\n");
        page.append("<div class=\"muse-doc-header\">\n");
        page.append(EmailRenderer.getHTMLForHeader(ed, searchResult, IA_links, debug));
        // muse-doc-header
        page.append("</div>");
        // get highlight terms from searchResult object for this document.
        Set<String> highlightTerms = searchResult.getHLInfoTerms(ed);
        page.append("\n<div class=\"muse-doc-body\">\n");
        Pair<StringBuilder, Boolean> contentsHtml = archive.getHTMLForContents(d, ed.getDate(), d.getUniqueId(), searchResult.getRegexToHighlight(), highlightTerms, authorisedEntities, IA_links, inFull, true);
        overflow = contentsHtml.second;
        page.append(contentsHtml.first);
        // muse-doc-body
        page.append("\n</div> <!-- .muse-doc-body -->\n");
        List<Blob> attachments = ed.attachments;
        if (attachments != null && attachments.size() > 0) {
            if (ModeConfig.isPublicMode()) {
                page.append(attachments.size() + " attachment" + (attachments.size() == 1 ? "" : "s") + ".");
            } else {
                page.append("<hr style=\"margin:10px\"/>\n<div class=\"attachments\">\n");
                BlobStore attachmentStore = archive.getBlobStore();
                if (attachmentStore == null) {
                    // no return, soldier on even if attachments unavailable for some reason
                    JSPHelper.log.warn("attachments store is null!");
                }
                for (Blob attachment : attachments) {
                    boolean highlight = highlightAttachments != null && highlightAttachments.contains(attachment);
                    appendAttachmentHtml(page, d, attachment, highlight, attachmentStore, archiveID);
                }
                // muse-doc-attachments
                page.append("\n</div>  <!-- .muse-doc-attachments -->\n");
            }
        }
        // .muse-doc
        page.append("\n</div>  <!-- .muse-doc -->\n");
        html = page.toString();
    } else if (d instanceof DatedDocument) {
        // rendering of plain dated documents is not implemented
        html = "To be implemented";
    } else {
        JSPHelper.log.warn("Unsupported Document: " + d.getClass().getName());
        html = "";
    }
    return new Pair<>(html, overflow);
}

/**
 * Appends the HTML tile for a single attachment to page: file name, linked preview image and,
 * if the file name was cleaned up or normalized, an info glyph carrying the original name/URL.
 * If attachmentStore is null, only a "no attachment" placeholder image is emitted, because the
 * name and all URLs come from the store.
 */
private static void appendAttachmentHtml(StringBuilder page, Document d, Blob attachment, boolean highlight, BlobStore attachmentStore, String archiveID) throws Exception {
    String css_class = "attachment" + (highlight ? " highlight" : "");
    page.append("<div class=\"" + css_class + "\">");
    if (attachmentStore == null) {
        // nothing can be looked up without the store; show the placeholder instead of failing with an NPE
        page.append("<br/>");
        page.append("<img class=\"attachment-preview\" src=\"images/no-attachment.png\"></img>\n");
        JSPHelper.log.info("No thumbnail for " + attachment);
        JSPHelper.log.info("No attachment URL for " + attachment);
        page.append("</div>");
        return;
    }

    String contentFileDataStoreURL = attachmentStore.get_URL_Normalized(attachment);
    boolean is_image = Util.is_image_filename(contentFileDataStoreURL);
    String attachmentURL = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLtail(contentFileDataStoreURL);
    // thumbnail: dedicated "tn" view if there is one, else the image itself, else a generic icon
    String thumbnailURL;
    String tnFileDataStoreURL = attachmentStore.getViewURL(attachment, "tn");
    if (tnFileDataStoreURL != null)
        thumbnailURL = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLtail(tnFileDataStoreURL);
    else if (attachmentStore.is_image(attachment))
        thumbnailURL = attachmentURL;
    else
        thumbnailURL = "images/sorry.png";

    String url = attachmentStore.full_filename_normalized(attachment, false);
    // cap to a length of 25, otherwise the attachment name
    // overflows the tn
    String display = Util.ellipsize(url, 25);
    page.append("&nbsp;" + "<span title=\"" + Util.escapeHTML(url) + "\">" + Util.escapeHTML(display) + "</span>&nbsp;");
    page.append("<br/>");

    String previewClass = "attachment-preview" + (is_image ? " img" : "");
    // d.hashCode() is just something to identify this
    // page/message
    page.append("<a rel=\"page" + d.hashCode() + "\" title=\"" + Util.escapeHTML(url) + "\" href=\"" + attachmentURL + "\">");
    page.append("<img class=\"" + previewClass + "\" " + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\"></img>\n");
    // close the anchor (this used to emit a second opening <a>)
    page.append("</a>\n");

    // if cleanedup.notequals(normalized) then normalization happened. Download original file (cleanedupfileURL)
    // original.notequals(cleanedup) then only name cleanup happened.(originalfilename)
    String cleanedupname = attachmentStore.full_filename_cleanedup(attachment);
    String normalizedname = attachmentStore.full_filename_normalized(attachment);
    boolean isNormalized = !cleanedupname.equals(normalizedname);
    boolean isCleanedName = !cleanedupname.equals(attachmentStore.full_filename_original(attachment));
    if (isNormalized || isCleanedName) {
        // NOTE(review): this id is repeated for every attachment on the page -- confirm the client script does not rely on it being unique
        page.append("<span class=\"glyphicon glyphicon-info-sign\" id=\"normalizationInfo\" ");
        if (isNormalized) {
            String cleanupurl = attachmentStore.get_URL_Cleanedup(attachment);
            String completeurl_cleanup = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLtail(cleanupurl);
            page.append("data-originalurl=" + "\"" + completeurl_cleanup + "\" ");
        }
        // emitted exactly once (it used to be written twice when both flags were set) and escaped,
        // since the original file name comes straight from the email
        page.append("data-originalname=" + "\"" + Util.escapeHTML(attachmentStore.full_filename_original(attachment, false)) + "\"");
        page.append("></span>");
    }
    page.append("</div>");
}
Also used : Blob(edu.stanford.muse.datacache.Blob) BlobStore(edu.stanford.muse.datacache.BlobStore) Pair(edu.stanford.muse.util.Pair)

Example 4 with BlobStore

use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.

the class JSPHelper method preparedArchive.

/**
 * Creates a new archive, sets it up under baseDir and returns it.
 * The setup options are derived from the request parameters in paramsMap (may be null),
 * followed by any extraOptions.
 */
public static Archive preparedArchive(Multimap<String, String> paramsMap, String baseDir, List<String> extraOptions) throws IOException {
    List<String> options = new ArrayList<>();
    if (paramsMap != null) {
        if ("yearly".equalsIgnoreCase(JSPHelper.getParam(paramsMap, "period")))
            options.add("-yearly");
        if (JSPHelper.getParam(paramsMap, "noattachments") != null)
            options.add("-noattachments");

        // filter params
        if ("true".equalsIgnoreCase(JSPHelper.getParam(paramsMap, "sentOnly")))
            options.add("-sentOnly");
        // { request parameter, option name }: the option is followed by the parameter's value,
        // and is skipped when the parameter is absent or empty
        String[][] valueOptions = { { "dateRange", "-date" }, { "keywords", "-keywords" }, { "filter", "-filter" } };
        for (String[] valueOption : valueOptions) {
            String value = JSPHelper.getParam(paramsMap, valueOption[0]);
            if (value == null || value.isEmpty())
                continue;
            options.add(valueOption[1]);
            options.add(value);
        }

        // advanced options: boolean switches whose option name is "-" + parameter name
        for (String flag : new String[] { "incrementalTFIDF", "openNLPNER" }) {
            if ("true".equalsIgnoreCase(JSPHelper.getParam(paramsMap, flag)))
                options.add("-" + flag);
        }
        // inverted switch: the option is added unless allText is explicitly true
        if (!"true".equalsIgnoreCase(JSPHelper.getParam(paramsMap, "allText")))
            options.add("-noalltext");
        for (String flag : new String[] { "locationsOnly", "orgsOnly", "includeQuotedMessages" }) {
            if ("true".equalsIgnoreCase(JSPHelper.getParam(paramsMap, flag)))
                options.add("-" + flag);
        }

        // unlike the value options above, an empty subject weight is still passed on
        String subjectWeight = JSPHelper.getParam(paramsMap, "subjectWeight");
        if (subjectWeight != null) {
            options.add("-subjectWeight");
            options.add(subjectWeight);
        }
    }
    if (!Util.nullOrEmpty(extraOptions))
        options.addAll(extraOptions);
    String[] args = options.toArray(new String[0]);

    // careful about the ordering here.. first setup, then read indexer, then run it
    Archive archive = Archive.createArchive();
    BlobStore blobStore = JSPHelper.preparedBlobStore(baseDir + File.separatorChar + Archive.BAG_DATA_FOLDER);
    archive.setup(baseDir, blobStore, args);
    log.info("archive setup in " + baseDir);
    return archive;
}
Also used : BlobStore(edu.stanford.muse.datacache.BlobStore)

Example 5 with BlobStore

use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.

the class Archive method export.

/**
 * a fresh archive is created under out_dir. name is the name of the session
 * under it. blobs are exported into this archive dir. destructive! but
 * should be so only in memory. original files on disk should be unmodified.
 * At the end, allDocs, the label manager and the base directory of this archive are set back to
 * their previous values; the blob store and entity book manager switched during export are not.
 *
 * @param retainedDocs the documents to keep in the exported archive
 * @param export_mode selects the status message; EXPORT_PROCESSING_TO_DISCOVERY additionally turns on
 *                    public-mode redaction (names-only body/title, no attachments, masked email domains)
 * @param out_dir directory to export into; an existing directory is deleted first
 * @param name session name handed to ArchiveReaderWriter.saveArchive
 * @param setStatusProvider receives a status provider for each export stage
 * @return out_dir, or null if out_dir is empty or could not be created
 * @throws Exception
 */
public synchronized String export(Collection<? extends Document> retainedDocs, Export_Mode export_mode, String out_dir, String name, Consumer<StatusProvider> setStatusProvider) throws Exception {
    if (Util.nullOrEmpty(out_dir))
        return null;
    File dir = new File(out_dir);
    if (dir.exists() && dir.isDirectory()) {
        log.warn("Overwriting existing directory '" + out_dir + "' (it may already exist)");
        FileUtils.deleteDirectory(dir);
    } else if (!dir.mkdirs()) {
        log.warn("Unable to create directory: " + out_dir);
        return null;
    }
    String statusmsg = export_mode == Export_Mode.EXPORT_APPRAISAL_TO_PROCESSING ? "Exporting to Processing" : (export_mode == Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY ? "Exporting to Discovery" : "Exporting to Delivery");
    boolean exportInPublicMode = export_mode == Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY;
    setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Preparing base directory.."));
    prepareBaseDir(out_dir);

    // data folders of the current archive (source) and of the exported archive (destination)
    String srcDataDir = baseDir + File.separator + Archive.BAG_DATA_FOLDER;
    String dstDataDir = out_dir + File.separator + Archive.BAG_DATA_FOLDER;

    // lexicons and the normalization file are not carried over into a public-mode export
    File srcLexicons = new File(srcDataDir + File.separatorChar + LEXICONS_SUBDIR);
    if (!exportInPublicMode && srcLexicons.exists())
        FileUtils.copyDirectory(srcLexicons, new File(dstDataDir + File.separatorChar + LEXICONS_SUBDIR));
    // copy normalization file if it exists
    String normalizationFileRelPath = File.separatorChar + Archive.SESSIONS_SUBDIR + File.separator + Archive.BLOBLNORMALIZATIONFILE_SUFFIX;
    File srcNormalizationFile = new File(srcDataDir + normalizationFileRelPath);
    if (!exportInPublicMode && srcNormalizationFile.exists())
        FileUtils.copyFile(srcNormalizationFile, new File(dstDataDir + normalizationFileRelPath));
    File srcImages = new File(srcDataDir + File.separatorChar + IMAGES_SUBDIR);
    if (srcImages.exists())
        FileUtils.copyDirectory(srcImages, new File(dstDataDir + File.separatorChar + IMAGES_SUBDIR));
    // internal disambiguation cache
    File srcFeatures = new File(srcDataDir + File.separatorChar + FEATURES_SUBDIR);
    if (srcFeatures.exists())
        FileUtils.copyDirectory(srcFeatures, new File(dstDataDir + File.separatorChar + FEATURES_SUBDIR));
    File srcAuthorityAssigner = new File(srcDataDir + File.separatorChar + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME);
    if (srcAuthorityAssigner.exists())
        FileUtils.copyFile(srcAuthorityAssigner, new File(dstDataDir + File.separatorChar + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME));

    // save the states that may get modified
    List<Document> savedAllDocs = allDocs;
    LabelManager oldLabelManager = getLabelManager();
    // change state of the current archive -temporarily//////////
    allDocs = new ArrayList<>(retainedDocs);
    if (exportInPublicMode) {
        // replace description with names;
        replaceDescriptionWithNames(allDocs, this);
        // Also replace the attachment information present in EmailDocument Object
        redactAttachmentDetailsFromDocs(allDocs, this);
    }
    Set<String> retainedDocIDs = retainedDocs.stream().map(Document::getUniqueId).collect(Collectors.toSet());
    LabelManager newLabelManager = getLabelManager().getLabelManagerForExport(retainedDocIDs, export_mode);
    setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting LabelManager.."));
    setLabelManager(newLabelManager);

    // copy index and if for public mode, also redact body and remove title
    // fields
    final boolean redact_body_instead_of_remove = true;
    Indexer.FilterFunctor emailFilter = doc -> {
        if (!retainedDocIDs.contains(doc.get("docId")))
            return false;
        if (exportInPublicMode) {
            // read the body before its fields are removed
            String text = redact_body_instead_of_remove ? doc.get("body") : null;
            doc.removeFields("body");
            doc.removeFields("body_original");
            if (text != null) {
                String redacted_text = IndexUtils.retainOnlyNames(text, doc);
                // this uses standard analyzer, not stemming because redacted bodys only have names.
                doc.add(new Field("body", redacted_text, Indexer.full_ft));
            }
            String title = doc.get("title");
            doc.removeFields("title");
            if (title != null) {
                // redact the title itself (this used to pass the body text, so the exported
                // title was derived from the body, or from null when there was no body)
                String redacted_title = IndexUtils.retainOnlyNames(title, doc);
                doc.add(new Field("title", redacted_title, Indexer.full_ft));
            }
        }
        return true;
    };
    Indexer.FilterFunctor attachmentFilter = doc -> {
        // attachments are never exported in public mode
        if (exportInPublicMode) {
            return false;
        }
        String docId = doc.get("emailDocId");
        if (docId == null) {
            int di = Integer.parseInt(doc.get("docId"));
            // don't want to print too many messages
            if (di < 10)
                log.error("Looks like this is an old archive, filtering all the attachments!!\n" + "Consider re-indexing with the latest version for a proper export.");
            return false;
        }
        return retainedDocIDs.contains(docId);
    };
    setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting Index.."));
    indexer.copyDirectoryWithDocFilter(out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER, emailFilter, attachmentFilter);
    log.info("Completed exporting indexes");
    setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting Blobs.."));
    // save the blobs in a new blobstore
    if (!exportInPublicMode) {
        log.info("Starting to export blobs, old blob store is: " + blobStore);
        Set<Blob> blobsToKeep = new LinkedHashSet<>();
        for (Document d : allDocs) {
            if (d instanceof EmailDocument && !Util.nullOrEmpty(((EmailDocument) d).attachments))
                blobsToKeep.addAll(((EmailDocument) d).attachments);
        }
        String blobsDir = out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + BLOBS_SUBDIR;
        new File(blobsDir).mkdirs();
        BlobStore newBlobStore = blobStore.createCopy(blobsDir, blobsToKeep);
        log.info("Completed exporting blobs, newBlobStore in dir: " + blobsDir + " is: " + newBlobStore);
        // switch to the new blob store (important -- the urls and indexes in the new blob store are different from the old one!)
        blobStore = newBlobStore;
    }
    String oldBaseDir = baseDir;
    // change base directory
    setBaseDir(out_dir);
    // email domains are masked only now, after the base directory has been switched,
    // because the address book is getting saved after maskEmailDomain
    if (exportInPublicMode) {
        List<Document> docs = this.getAllDocs();
        List<EmailDocument> eds = new ArrayList<>();
        for (Document doc : docs)
            eds.add((EmailDocument) doc);
        EmailUtils.maskEmailDomain(eds, this.addressBook);
    }
    setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting EntityBook Manager.."));
    // now read entitybook manager as well (or build from lucene)
    String outdir = out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR;
    String entityBookPath = outdir + File.separatorChar + Archive.ENTITYBOOKMANAGER_SUFFIX;
    EntityBookManager entityBookManager = ArchiveReaderWriter.readEntityBookManager(this, entityBookPath);
    this.setEntityBookManager(entityBookManager);
    // recompute entity count because some documents have been redacted
    double theta = 0.001;
    this.collectionMetadata.entityCounts = this.getEntityBookManager().getEntitiesCountMapModuloThreshold(theta);
    // write out the archive file.. note that this is a fresh creation of archive in the exported folder
    setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Export done. Saving Archive.."));
    // save .session file.
    ArchiveReaderWriter.saveArchive(out_dir, name, this, Save_Archive_Mode.FRESH_CREATION);
    log.info("Completed saving archive object");
    // restore states
    setBaseDir(oldBaseDir);
    allDocs = savedAllDocs;
    setLabelManager(oldLabelManager);
    return out_dir;
}
Also used : edu.stanford.muse.util(edu.stanford.muse.util) ParseException(org.apache.lucene.queryparser.classic.ParseException) Config(edu.stanford.muse.Config) java.util(java.util) Getter(lombok.Getter) ResultCache(edu.stanford.muse.ResultCacheManager.ResultCache) Blob(edu.stanford.muse.datacache.Blob) MessageDigest(java.security.MessageDigest) AnnotationManager(edu.stanford.muse.AnnotationManager.AnnotationManager) NameInfo(edu.stanford.muse.ie.NameInfo) SimpleDateFormat(java.text.SimpleDateFormat) Multimap(com.google.common.collect.Multimap) StandardSupportedAlgorithms(gov.loc.repository.bagit.hash.StandardSupportedAlgorithms) java.nio.file(java.nio.file) LabelManager(edu.stanford.muse.LabelManager.LabelManager) Tuple2(groovy.lang.Tuple2) Gson(com.google.gson.Gson) edu.stanford.muse.email(edu.stanford.muse.email) gov.loc.repository.bagit.exceptions(gov.loc.repository.bagit.exceptions) MetadataWriter(gov.loc.repository.bagit.writer.MetadataWriter) CorrespondentAuthorityMapper(edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper) LinkedHashMultimap(com.google.common.collect.LinkedHashMultimap) LinkedListMultimap(com.google.common.collect.LinkedListMultimap) EmailRenderer(edu.stanford.muse.webapp.EmailRenderer) BlobStore(edu.stanford.muse.datacache.BlobStore) PathUtils(gov.loc.repository.bagit.util.PathUtils) AddressBook(edu.stanford.muse.AddressBookManager.AddressBook) DateTime(org.joda.time.DateTime) FileUtils(org.apache.commons.io.FileUtils) Label(edu.stanford.muse.LabelManager.Label) ManifestWriter(gov.loc.repository.bagit.writer.ManifestWriter) Collectors(java.util.stream.Collectors) Consumer(java.util.function.Consumer) Logger(org.apache.logging.log4j.Logger) java.io(java.io) NER(edu.stanford.muse.ner.NER) gov.loc.repository.bagit.creator(gov.loc.repository.bagit.creator) Field(org.apache.lucene.document.Field) EntityBookManager(edu.stanford.muse.ie.variants.EntityBookManager) NoSuchAlgorithmException(java.security.NoSuchAlgorithmException) 
Header(javax.mail.Header) NEType(edu.stanford.muse.ner.model.NEType) BagReader(gov.loc.repository.bagit.reader.BagReader) ModeConfig(edu.stanford.muse.webapp.ModeConfig) LogManager(org.apache.logging.log4j.LogManager) gov.loc.repository.bagit.domain(gov.loc.repository.bagit.domain) JSONArray(org.json.JSONArray) Blob(edu.stanford.muse.datacache.Blob) EntityBookManager(edu.stanford.muse.ie.variants.EntityBookManager) Field(org.apache.lucene.document.Field) LabelManager(edu.stanford.muse.LabelManager.LabelManager) BlobStore(edu.stanford.muse.datacache.BlobStore)

Aggregations

BlobStore (edu.stanford.muse.datacache.BlobStore)10 AddressBook (edu.stanford.muse.AddressBookManager.AddressBook)4 Blob (edu.stanford.muse.datacache.Blob)4 Pair (edu.stanford.muse.util.Pair)4 LinkedHashMultimap (com.google.common.collect.LinkedHashMultimap)2 Multimap (com.google.common.collect.Multimap)2 Gson (com.google.gson.Gson)2 CorrespondentAuthorityMapper (edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper)2 AnnotationManager (edu.stanford.muse.AnnotationManager.AnnotationManager)2 Config (edu.stanford.muse.Config)2 Label (edu.stanford.muse.LabelManager.Label)2 LabelManager (edu.stanford.muse.LabelManager.LabelManager)2 edu.stanford.muse.email (edu.stanford.muse.email)2 CancelledException (edu.stanford.muse.exceptions.CancelledException)2 NameInfo (edu.stanford.muse.ie.NameInfo)2 EmailDocument (edu.stanford.muse.index.EmailDocument)2 NER (edu.stanford.muse.ner.NER)2 NEType (edu.stanford.muse.ner.model.NEType)2 edu.stanford.muse.util (edu.stanford.muse.util)2 EmailRenderer (edu.stanford.muse.webapp.EmailRenderer)2