use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class MuseEmailFetcher method fetchAndIndexEmails.
/**
 * Key method to fetch the actual email messages and index them into the archive. Can take a long time;
 * every fetcher is run in the calling thread, one after the other.
 * <p>
 * Once the fetchers are done, dates are cleaned and a new address book is built over all docs
 * in the archive (not just the ones that may have been just imported), and the fetch stats are updated.
 *
 * @param archive archive that receives the messages; its blob store (if any) is used for attachments
 * @param selectedFolders each entry is in the format {@code <account name>^-^<folder name>}
 * @param useDefaultFolders passed to setupFoldersForFetchers together with selectedFolders
 * @param fetchConfig fetch options; downloadAttachments is switched off here when the archive has no blob store
 * @param session is used only to set the status provider object. callers who do not need to track status can leave it as null
 * @throws CancelledException if a fetcher reports that it was cancelled
 * @throws OutOfMemoryError if a fetcher reports that it may have run out of memory
 * @throws NoDefaultFolderException declared for callers; not thrown directly by the code in this method
 */
public void fetchAndIndexEmails(Archive archive, String[] selectedFolders, boolean useDefaultFolders, FetchConfig fetchConfig, HttpSession session) throws MessagingException, InterruptedException, IOException, JSONException, NoDefaultFolderException, CancelledException {
    setupFetchers(-1);

    long startTime = System.currentTimeMillis();
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Starting to process messages..."));

    boolean op_cancelled = false, out_of_mem = false;

    // attachments can only be downloaded if there is a blob store to put them in
    BlobStore attachmentsStore = archive.getBlobStore();
    fetchConfig.downloadAttachments = fetchConfig.downloadAttachments && attachmentsStore != null;

    if (Util.nullOrEmpty(fetchers)) {
        log.warn("Trying to fetch email with no fetchers, setup not called ?");
        return;
    }

    setupFoldersForFetchers(fetchers, selectedFolders, useDefaultFolders);

    List<FolderInfo> fetchedFolderInfos = new ArrayList<>();

    // one fetcher will aggregate everything
    FetchStats stats = new FetchStats();
    MTEmailFetcher aggregatingFetcher = null;

    // a fetcher is one source, like an account or a top-level mbox dir. A fetcher could include multiple folders.
    long startTimeMillis = System.currentTimeMillis();
    for (MTEmailFetcher fetcher : fetchers) {
        if (session != null)
            session.setAttribute("statusProvider", fetcher);

        fetcher.setArchive(archive);
        fetcher.setFetchConfig(fetchConfig);
        log.info("Memory status before fetching emails: " + Util.getMemoryStats());

        // this is the big call, can run for a long time. Note: running in the same thread, its not fetcher.start();
        List<FolderInfo> foldersFetchedByThisFetcher = fetcher.run();

        // but don't abort immediately, only at the end, after addressbook has been built for at least the processed messages
        if (fetcher.isCancelled()) {
            log.info("NOTE: fetcher operation was cancelled");
            op_cancelled = true;
            break;
        }

        if (fetcher.mayHaveRunOutOfMemory()) {
            log.warn("Fetcher operation ran out of memory " + fetcher);
            out_of_mem = true;
            break;
        }

        boolean fetchedSomething = !Util.nullOrEmpty(foldersFetchedByThisFetcher);
        if (fetchedSomething)
            fetchedFolderInfos.addAll(foldersFetchedByThisFetcher);

        // the first non-empty fetcher becomes the aggregator
        if (aggregatingFetcher == null && fetchedSomething)
            aggregatingFetcher = fetcher;
        if (aggregatingFetcher != null)
            aggregatingFetcher.merge(fetcher);

        // add the folders indexed by THIS fetcher to the stats. (iterating the cumulative
        // fetchedFolderInfos here would re-attribute earlier fetchers' folders to this one.)
        if (fetchedSomething) {
            EmailStore store = fetcher.getStore();
            String fetcherDescription = store.displayName + ":" + store.emailAddress;
            for (FolderInfo fi : foldersFetchedByThisFetcher)
                stats.selectedFolders.add(new Pair<>(fetcherDescription, fi));
        }
    }

    if (op_cancelled)
        throw new CancelledException();
    if (out_of_mem)
        throw new OutOfMemoryError();

    if (aggregatingFetcher != null) {
        stats.importStats = aggregatingFetcher.stats;
        if (aggregatingFetcher.mayHaveRunOutOfMemory())
            throw new OutOfMemoryError();
    }
    // save memory
    aggregatingFetcher = null;

    long endTimeMillis = System.currentTimeMillis();
    long elapsedMillis = endTimeMillis - startTimeMillis;
    log.info(elapsedMillis + " ms for fetch+index, Memory status: " + Util.getMemoryStats());

    // note: this is all archive docs, not just the ones that may have been just imported
    List<EmailDocument> allEmailDocs = (List) archive.getAllDocs();
    archive.addFetchedFolderInfos(fetchedFolderInfos);

    if (allEmailDocs.isEmpty())
        log.warn("0 messages from email fetcher");

    EmailUtils.cleanDates(allEmailDocs);

    // create a new address book
    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Building address book..."));
    AddressBook addressBook = EmailDocument.buildAddressBook(allEmailDocs, archive.ownerEmailAddrs, archive.ownerNames);
    log.info("Address book stats: " + addressBook.getStats());

    if (session != null)
        session.setAttribute("statusProvider", new StaticStatusProvider("Finishing up..."));
    archive.setAddressBook(addressBook);

    // we shouldn't really have dups now because the archive ensures that only unique docs are added
    // move sorting to archive.postprocess?
    EmailUtils.removeDupsAndSort(allEmailDocs);

    // report stats
    stats.lastUpdate = new Date().getTime();
    // the user key is no longer read from the session
    stats.userKey = "USER KEY UNUSED";
    stats.fetchAndIndexTimeMillis = elapsedMillis;
    updateStats(archive, addressBook, stats);

    if (session != null)
        session.removeAttribute("statusProvider");
    log.info("Fetch+index complete: " + Util.commatize(System.currentTimeMillis() - startTime) + " ms");
}
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class SearchResult method filterForAttachmentEntities.
/**
 ******************************ATTACHMENT SPECIFIC FILTERS************************************
 */
/**
 * Returns only those docs with attachments matching params[attachmentEntity]
 * (this field is or-delimiter separated). The parameter value is lower-cased before it is split.
 * <p>
 * A doc is kept when at least one of its attachments has a keyword (as reported by the blob store)
 * that is contained in the requested entities. For every kept doc, the matching blobs are added to
 * the doc's existing AttachmentHLInfo (the same object held by inputSet).
 * If the parameter is absent or empty, inputSet itself is returned.
 * Todo: review usage of this and BlobStore.getKeywordsForBlob()
 *
 * @param inputSet the search result to filter
 * @return inputSet when there is nothing to filter on, else a new SearchResult restricted to the matching docs
 */
private static SearchResult filterForAttachmentEntities(SearchResult inputSet) {
    String val = JSPHelper.getParam(inputSet.queryParams, "attachmentEntity");
    if (Util.nullOrEmpty(val))
        return inputSet;

    val = val.toLowerCase();
    Set<String> entities = Util.splitFieldForOr(val);
    BlobStore blobStore = inputSet.archive.blobStore;

    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> outputDocs = new HashMap<>();
    for (Document k : inputSet.matchedDocs.keySet()) {
        EmailDocument ed = (EmailDocument) k;

        // a message without attachments can never match
        Collection<Blob> blobs = ed.attachments;
        if (blobs == null)
            continue;

        // check all attachments of ed for a match
        Set<Blob> matchedBlobs = new HashSet<>();
        for (Blob blob : blobs) {
            Collection<String> keywords = blobStore.getKeywordsForBlob(blob);
            // only test for an overlap; do not retainAll() on the returned collection,
            // since it may be the blob store's own data
            if (keywords != null && keywords.stream().anyMatch(entities::contains))
                matchedBlobs.add(blob);
        }

        // keep the doc only if at least one of its attachments is of interest
        if (!matchedBlobs.isEmpty()) {
            BodyHLInfo bhlinfo = inputSet.matchedDocs.get(k).first;
            AttachmentHLInfo attachmentHLInfo = inputSet.matchedDocs.get(k).second;
            attachmentHLInfo.addMultipleInfo(matchedBlobs);
            outputDocs.put(k, new Pair<>(bhlinfo, attachmentHLInfo));
        }
    }

    return new SearchResult(outputDocs, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class EmailRenderer method htmlForDocument.
/**
 * Returns the HTML for a document, as shown in the message browsing screen.
 * <p>
 * For an EmailDocument (1 doc = 1 page) the page consists of the header, the message body and,
 * if the message has attachments, either an attachment count (public mode) or one tile per attachment.
 * A DatedDocument yields a placeholder string, any other document type yields an empty string.
 *
 * @param d the document to render
 * @param searchResult supplies the archive and the highlighting information for d
 * @param datasetTitle not used by this method
 * @param authorisedEntities passed through to archive.getHTMLForContents
 * @param IA_links passed through to the header and contents renderers
 * @param inFull passed through to archive.getHTMLForContents
 * @param debug passed through to the header renderer
 * @param archiveID archive id that is put into the serveAttachment.jsp links
 * @return a pair of the generated HTML and the overflow flag returned by archive.getHTMLForContents
 *         (false for non-email documents)
 * @throws Exception
 */
// TODO: inFull, debug params can be removed
// TODO: Consider a HighlighterOptions class
public static Pair<String, Boolean> htmlForDocument(Document d, SearchResult searchResult, String datasetTitle, Map<String, Map<String, Short>> authorisedEntities, boolean IA_links, boolean inFull, boolean debug, String archiveID) throws Exception {
    JSPHelper.log.debug("Generating HTML for document: " + d);
    Archive archive = searchResult.getArchive();
    String html;
    boolean overflow = false;

    if (d instanceof EmailDocument) {
        // for email docs, 1 doc = 1 page
        EmailDocument ed = (EmailDocument) d;
        List<Blob> highlightAttachments = searchResult.getAttachmentHighlightInformation(d);

        StringBuilder page = new StringBuilder();
        page.append("<div class=\"muse-doc\">\n");

        page.append("<div class=\"muse-doc-header\">\n");
        page.append(EmailRenderer.getHTMLForHeader(ed, searchResult, IA_links, debug));
        page.append("</div>"); // muse-doc-header

        // get highlight terms from searchResult object for this document.
        Set<String> highlightTerms = searchResult.getHLInfoTerms(ed);

        page.append("\n<div class=\"muse-doc-body\">\n");
        Pair<StringBuilder, Boolean> contentsHtml = archive.getHTMLForContents(d, ed.getDate(), d.getUniqueId(), searchResult.getRegexToHighlight(), highlightTerms, authorisedEntities, IA_links, inFull, true);
        page.append(contentsHtml.first);
        overflow = contentsHtml.second;
        page.append("\n</div> <!-- .muse-doc-body -->\n");

        List<Blob> attachments = ed.attachments;
        if (attachments != null && attachments.size() > 0) {
            if (ModeConfig.isPublicMode()) {
                // in public mode only the number of attachments is revealed
                page.append(attachments.size() + " attachment" + (attachments.size() == 1 ? "" : "s") + ".");
            } else {
                page.append("<hr style=\"margin:10px\"/>\n<div class=\"attachments\">\n");
                BlobStore attachmentStore = archive.getBlobStore();
                for (Blob attachment : attachments) {
                    boolean highlight = highlightAttachments != null && highlightAttachments.contains(attachment);
                    appendAttachmentHtml(page, d, attachment, attachmentStore, highlight, archiveID);
                }
                page.append("\n</div> <!-- .muse-doc-attachments -->\n");
            }
        }

        page.append("\n</div> <!-- .muse-doc -->\n");
        html = page.toString();
    } else if (d instanceof DatedDocument) {
        html = "To be implemented";
    } else {
        JSPHelper.log.warn("Unsupported Document: " + d.getClass().getName());
        html = "";
    }

    return new Pair<>(html, overflow);
}

/**
 * Appends the tile for a single attachment to page: file name, thumbnail linking to the attachment and,
 * if the file name was cleaned up or the file was normalized, a marker carrying the original name/url.
 * If attachmentStore is null, only a "no attachment" placeholder image is emitted.
 */
private static void appendAttachmentHtml(StringBuilder page, Document d, Blob attachment, BlobStore attachmentStore, boolean highlight, String archiveID) throws Exception {
    String tileClass = "attachment" + (highlight ? " highlight" : "");
    page.append("<div class=\"" + tileClass + "\">");

    if (attachmentStore == null) {
        // soldier on even if attachments are unavailable for some reason: nothing about the blob
        // (name, urls) can be looked up without the store, so only show the placeholder
        JSPHelper.log.warn("attachments store is null!");
        JSPHelper.log.info("No thumbnail for " + attachment);
        JSPHelper.log.info("No attachment URL for " + attachment);
        page.append("<img class=\"attachment-preview\" src=\"images/no-attachment.png\"></img>\n");
        page.append("</div>");
        return;
    }

    String contentFileDataStoreURL = attachmentStore.get_URL_Normalized(attachment);
    boolean is_image = Util.is_image_filename(contentFileDataStoreURL);
    String attachmentURL = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLtail(contentFileDataStoreURL);

    // use the thumbnail view if there is one, else the image itself, else a stock picture
    String thumbnailURL;
    String tnFileDataStoreURL = attachmentStore.getViewURL(attachment, "tn");
    if (tnFileDataStoreURL != null)
        thumbnailURL = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLtail(tnFileDataStoreURL);
    else if (attachmentStore.is_image(attachment))
        thumbnailURL = attachmentURL;
    else
        thumbnailURL = "images/sorry.png";

    // show the filename in any case,
    // cap to a length of 25, otherwise the attachment name overflows the tn
    String url = attachmentStore.full_filename_normalized(attachment, false);
    String display = Util.ellipsize(url, 25);
    page.append(" " + "<span title=\"" + Util.escapeHTML(url) + "\">" + Util.escapeHTML(display) + "</span> ");
    page.append("<br/>");

    String previewClass = "attachment-preview" + (is_image ? " img" : "");
    String leader = "<img class=\"" + previewClass + "\" ";
    // d.hashCode() is just something to identify this page/message
    page.append("<a rel=\"page" + d.hashCode() + "\" title=\"" + Util.escapeHTML(url) + "\" href=\"" + attachmentURL + "\">");
    page.append(leader + "href=\"" + attachmentURL + "\" src=\"" + thumbnailURL + "\"></img>\n");
    // close the link (this used to emit a second opening tag, leaving the anchor unterminated)
    page.append("</a>\n");

    // if cleanedup.notequals(normalized) then normalization happened. Download original file (cleanedupfileURL)
    // original.notequals(cleanedup) then only name cleanup happened.(originalfilename)
    // so the attributes are either only originalfilename or cleanedupfileURL or both.
    String cleanedupname = attachmentStore.full_filename_cleanedup(attachment);
    String normalizedname = attachmentStore.full_filename_normalized(attachment);
    String cleanupurl = attachmentStore.get_URL_Cleanedup(attachment);
    boolean isNormalized = !cleanedupname.equals(normalizedname);
    boolean isCleanedName = !cleanedupname.equals(attachmentStore.full_filename_original(attachment));
    if (isNormalized || isCleanedName) {
        page.append("<span class=\"glyphicon glyphicon-info-sign\" id=\"normalizationInfo\" ");
        if (isNormalized) {
            String completeurl_cleanup = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLtail(cleanupurl);
            page.append("data-originalurl=" + "\"" + completeurl_cleanup + "\" ");
        }
        // emitted exactly once, and escaped: the original file name comes straight from the email
        page.append("data-originalname=" + "\"" + Util.escapeHTML(attachmentStore.full_filename_original(attachment, false)) + "\"");
        page.append("></span>");
    }

    page.append("</div>");
}
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class JSPHelper method preparedArchive.
/**
 * Creates a new archive rooted at baseDir and returns it.
 * <p>
 * The request parameters in paramsMap (may be null) are translated into command-line style
 * options, extraOptions (may be null or empty) are appended to them, and the resulting
 * option array is handed to the archive's setup together with a blob store prepared
 * under the bag data folder of baseDir.
 */
public static Archive preparedArchive(Multimap<String, String> paramsMap, String baseDir, List<String> extraOptions) throws IOException {
    List<String> options = new ArrayList<>();

    if (paramsMap != null) {
        String period = JSPHelper.getParam(paramsMap, "period");
        if ("yearly".equalsIgnoreCase(period)) {
            options.add("-yearly");
        }

        String noAttachments = JSPHelper.getParam(paramsMap, "noattachments");
        if (noAttachments != null) {
            options.add("-noattachments");
        }

        // filter params
        String sentOnly = JSPHelper.getParam(paramsMap, "sentOnly");
        if ("true".equalsIgnoreCase(sentOnly)) {
            options.add("-sentOnly");
        }

        String dateRange = JSPHelper.getParam(paramsMap, "dateRange");
        if (dateRange != null && !dateRange.isEmpty()) {
            options.add("-date");
            options.add(dateRange);
        }

        String keywordsValue = JSPHelper.getParam(paramsMap, "keywords");
        if (keywordsValue != null && !keywordsValue.isEmpty()) {
            options.add("-keywords");
            options.add(keywordsValue);
        }

        String filterValue = JSPHelper.getParam(paramsMap, "filter");
        if (filterValue != null && !filterValue.isEmpty()) {
            options.add("-filter");
            options.add(filterValue);
        }

        // advanced options: flags that are switched on by a "true" value
        for (String flag : new String[] { "incrementalTFIDF", "openNLPNER" }) {
            if ("true".equalsIgnoreCase(JSPHelper.getParam(paramsMap, flag))) {
                options.add("-" + flag);
            }
        }

        // allText is the odd one out: anything but "true" disables it
        String allText = JSPHelper.getParam(paramsMap, "allText");
        if (!"true".equalsIgnoreCase(allText)) {
            options.add("-noalltext");
        }

        for (String flag : new String[] { "locationsOnly", "orgsOnly", "includeQuotedMessages" }) {
            if ("true".equalsIgnoreCase(JSPHelper.getParam(paramsMap, flag))) {
                options.add("-" + flag);
            }
        }

        // unlike the other valued options, an empty subject weight is still passed on
        String subjectWeight = JSPHelper.getParam(paramsMap, "subjectWeight");
        if (subjectWeight != null) {
            options.add("-subjectWeight");
            options.add(subjectWeight);
        }
    }

    if (!Util.nullOrEmpty(extraOptions)) {
        options.addAll(extraOptions);
    }

    String[] args = options.toArray(new String[0]);

    // careful about the ordering here.. first setup, then read indexer, then run it
    Archive newArchive = Archive.createArchive();
    String blobStoreDir = baseDir + File.separatorChar + Archive.BAG_DATA_FOLDER;
    BlobStore store = JSPHelper.preparedBlobStore(blobStoreDir);
    newArchive.setup(baseDir, store, args);

    log.info("archive setup in " + baseDir);
    return newArchive;
}
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class Archive method export.
/**
 * A fresh archive is created under out_dir. name is the name of the session
 * under it. blobs are exported into this archive dir. destructive! but
 * should be so only in memory. original files on disk should be unmodified.
 * <p>
 * While exporting, this archive's docs, label manager and base dir are temporarily switched to the
 * exported ones; these three are put back when the method finishes, whether it succeeds or fails.
 * Other changes made along the way (e.g. the blob store switch for non-public exports, the entity
 * book manager and entity counts, redaction of the docs in public mode) are NOT undone.
 *
 * @param retainedDocs the docs that should be present in the exported archive
 * @param export_mode which transition is being exported; EXPORT_PROCESSING_TO_DISCOVERY is treated as
 *        public mode (bodies and titles reduced to names, no attachments/blobs, email domains masked)
 * @param out_dir directory for the exported archive; an existing directory is deleted first
 * @param name name of the session to save under out_dir
 * @param setStatusProvider receives a status object for every stage of the export
 * @return out_dir, or null if out_dir is null/empty or could not be created
 * @throws Exception
 */
public synchronized String export(Collection<? extends Document> retainedDocs, Export_Mode export_mode, String out_dir, String name, Consumer<StatusProvider> setStatusProvider) throws Exception {
    if (Util.nullOrEmpty(out_dir))
        return null;
    File dir = new File(out_dir);
    if (dir.exists() && dir.isDirectory()) {
        log.warn("Overwriting existing directory '" + out_dir + "' (it may already exist)");
        FileUtils.deleteDirectory(dir);
    } else if (!dir.mkdirs()) {
        log.warn("Unable to create directory: " + out_dir);
        return null;
    }

    String statusmsg = export_mode == Export_Mode.EXPORT_APPRAISAL_TO_PROCESSING ? "Exporting to Processing" : (export_mode == Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY ? "Exporting to Discovery" : "Exporting to Delivery");
    boolean exportInPublicMode = export_mode == Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY;
    setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Preparing base directory.."));
    prepareBaseDir(out_dir);

    // copy the auxiliary files and directories that exist in the current archive
    String srcDataDir = baseDir + File.separator + Archive.BAG_DATA_FOLDER;
    String destDataDir = out_dir + File.separator + Archive.BAG_DATA_FOLDER;

    File srcLexicons = new File(srcDataDir + File.separatorChar + LEXICONS_SUBDIR);
    if (!exportInPublicMode && srcLexicons.exists())
        FileUtils.copyDirectory(srcLexicons, new File(destDataDir + File.separatorChar + LEXICONS_SUBDIR));

    // copy normalization file if it exists
    String normalizationFile = File.separatorChar + Archive.SESSIONS_SUBDIR + File.separator + Archive.BLOBLNORMALIZATIONFILE_SUFFIX;
    File srcNormalization = new File(srcDataDir + normalizationFile);
    if (!exportInPublicMode && srcNormalization.exists())
        FileUtils.copyFile(srcNormalization, new File(destDataDir + normalizationFile));

    File srcImages = new File(srcDataDir + File.separatorChar + IMAGES_SUBDIR);
    if (srcImages.exists())
        FileUtils.copyDirectory(srcImages, new File(destDataDir + File.separatorChar + IMAGES_SUBDIR));

    // internal disambiguation cache
    File srcFeatures = new File(srcDataDir + File.separatorChar + FEATURES_SUBDIR);
    if (srcFeatures.exists())
        FileUtils.copyDirectory(srcFeatures, new File(destDataDir + File.separatorChar + FEATURES_SUBDIR));

    File srcAuthorities = new File(srcDataDir + File.separatorChar + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME);
    if (srcAuthorities.exists())
        FileUtils.copyFile(srcAuthorities, new File(destDataDir + File.separatorChar + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME));

    // save the states that may get modified
    List<Document> savedAllDocs = allDocs;
    LabelManager oldLabelManager = getLabelManager();
    String oldBaseDir = baseDir;
    boolean baseDirChanged = false;

    try {
        // change state of the current archive -temporarily//////////
        allDocs = new ArrayList<>(retainedDocs);
        if (exportInPublicMode) {
            // replace description with names;
            replaceDescriptionWithNames(allDocs, this);
            // Also replace the attachment information present in EmailDocument Object
            redactAttachmentDetailsFromDocs(allDocs, this);
        }

        Set<String> retainedDocIDs = retainedDocs.stream().map(Document::getUniqueId).collect(Collectors.toSet());
        LabelManager newLabelManager = getLabelManager().getLabelManagerForExport(retainedDocIDs, export_mode);
        setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting LabelManager.."));
        setLabelManager(newLabelManager);

        // copy index and if for public mode, also redact body and title fields
        final boolean redact_body_instead_of_remove = true;
        Indexer.FilterFunctor emailFilter = doc -> {
            if (!retainedDocIDs.contains(doc.get("docId")))
                return false;

            if (exportInPublicMode) {
                String text = null;
                if (redact_body_instead_of_remove) {
                    text = doc.get("body");
                }
                doc.removeFields("body");
                doc.removeFields("body_original");
                if (text != null) {
                    String redacted_text = IndexUtils.retainOnlyNames(text, doc);
                    // this uses standard analyzer, not stemming because redacted bodys only have names.
                    doc.add(new Field("body", redacted_text, Indexer.full_ft));
                }

                String title = doc.get("title");
                doc.removeFields("title");
                if (title != null) {
                    // redact the title itself (this used to be computed from the body text)
                    String redacted_title = IndexUtils.retainOnlyNames(title, doc);
                    doc.add(new Field("title", redacted_title, Indexer.full_ft));
                }
            }
            return true;
        };

        Indexer.FilterFunctor attachmentFilter = doc -> {
            // attachments are never exported in public mode
            if (exportInPublicMode) {
                return false;
            }
            String docId = doc.get("emailDocId");
            if (docId == null) {
                int di = Integer.parseInt(doc.get("docId"));
                // don't want to print too many messages
                if (di < 10)
                    log.error("Looks like this is an old archive, filtering all the attachments!!\n" + "Consider re-indexing with the latest version for a proper export.");
                return false;
            }
            return retainedDocIDs.contains(docId);
        };

        setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting Index.."));
        indexer.copyDirectoryWithDocFilter(out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER, emailFilter, attachmentFilter);
        log.info("Completed exporting indexes");

        setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting Blobs.."));
        // save the blobs in a new blobstore
        if (!exportInPublicMode) {
            log.info("Starting to export blobs, old blob store is: " + blobStore);
            Set<Blob> blobsToKeep = new LinkedHashSet<>();
            for (Document d : allDocs)
                if (d instanceof EmailDocument && !Util.nullOrEmpty(((EmailDocument) d).attachments))
                    blobsToKeep.addAll(((EmailDocument) d).attachments);
            String blobsDir = out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + BLOBS_SUBDIR;
            new File(blobsDir).mkdirs();
            BlobStore newBlobStore = blobStore.createCopy(blobsDir, blobsToKeep);
            log.info("Completed exporting blobs, newBlobStore in dir: " + blobsDir + " is: " + newBlobStore);
            // switch to the new blob store (important -- the urls and indexes in the new blob store are different from the old one!)
            blobStore = newBlobStore;
        }

        // change base directory. the flag is raised first so that the old base dir is restored
        // even if the switch itself fails half way
        baseDirChanged = true;
        setBaseDir(out_dir);

        // done here, after changing the basedir of the archive, because the addressbook is getting
        // saved after maskEmailDomain.
        if (exportInPublicMode) {
            List<EmailDocument> eds = new ArrayList<>();
            for (Document doc : this.getAllDocs())
                if (doc instanceof EmailDocument)
                    eds.add((EmailDocument) doc);
            EmailUtils.maskEmailDomain(eds, this.addressBook);
        }

        setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Exporting EntityBook Manager.."));
        // now read entitybook manager as well (or build from lucene)
        String outdir = out_dir + File.separatorChar + Archive.BAG_DATA_FOLDER + File.separatorChar + Archive.SESSIONS_SUBDIR;
        String entityBookPath = outdir + File.separatorChar + Archive.ENTITYBOOKMANAGER_SUFFIX;
        EntityBookManager entityBookManager = ArchiveReaderWriter.readEntityBookManager(this, entityBookPath);
        this.setEntityBookManager(entityBookManager);

        // recompute entity count because some documents have been redacted
        double theta = 0.001;
        this.collectionMetadata.entityCounts = this.getEntityBookManager().getEntitiesCountMapModuloThreshold(theta);

        // write out the archive file.. note that this is a fresh creation of archive in the exported folder
        setStatusProvider.accept(new StaticStatusProvider(statusmsg + ":" + "Export done. Saving Archive.."));
        // save .session file.
        ArchiveReaderWriter.saveArchive(out_dir, name, this, Save_Archive_Mode.FRESH_CREATION);
        log.info("Completed saving archive object");
    } finally {
        // restore states, also when the export failed, so that the live archive is not left
        // pointing at the (possibly incomplete) export
        if (baseDirChanged)
            setBaseDir(oldBaseDir);
        allDocs = savedAllDocs;
        setLabelManager(oldLabelManager);
    }

    return out_dir;
}
Aggregations