use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class Archive method export.
/**
* A fresh archive is created under out_dir; name is the name of the session
* under it. Blobs are exported into this archive dir. Destructive, but
* should be so only in memory; original files on disk should be unmodified.
*
* @param retainedDocs the docs to retain in the exported archive
* @throws Exception
*/
public synchronized String export(Collection<? extends Document> retainedDocs, Export_Mode export_mode, String out_dir, String name) throws Exception {
if (Util.nullOrEmpty(out_dir))
return null;
File dir = new File(out_dir);
if (dir.exists() && dir.isDirectory()) {
log.warn("Overwriting existing directory '" + out_dir + "' (it may already exist)");
FileUtils.deleteDirectory(dir);
} else if (!dir.mkdirs()) {
log.warn("Unable to create directory: " + out_dir);
return null;
}
boolean exportInPublicMode = export_mode == Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY;
Archive.prepareBaseDir(out_dir);
if (!exportInPublicMode && new File(baseDir + File.separator + LEXICONS_SUBDIR).exists())
FileUtils.copyDirectory(new File(baseDir + File.separator + LEXICONS_SUBDIR), new File(out_dir + File.separator + LEXICONS_SUBDIR));
if (new File(baseDir + File.separator + IMAGES_SUBDIR).exists())
FileUtils.copyDirectory(new File(baseDir + File.separator + IMAGES_SUBDIR), new File(out_dir + File.separator + IMAGES_SUBDIR));
// internal disambiguation cache
if (new File(baseDir + File.separator + FEATURES_SUBDIR).exists())
FileUtils.copyDirectory(new File(baseDir + File.separator + FEATURES_SUBDIR), new File(out_dir + File.separator + FEATURES_SUBDIR));
if (new File(baseDir + File.separator + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME).exists())
FileUtils.copyFile(new File(baseDir + File.separator + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME), new File(out_dir + File.separator + edu.stanford.muse.Config.AUTHORITY_ASSIGNER_FILENAME));
// save the states that may get modified
List<Document> savedAllDocs = allDocs;
LabelManager oldLabelManager = getLabelManager();
// change state of the current archive -temporarily//////////
if (exportInPublicMode) {
// replace description with names;
replaceDescriptionWithNames(allDocs, this);
} else {
allDocs = new ArrayList<>(retainedDocs);
}
Set<String> retainedDocIDs = retainedDocs.stream().map(Document::getUniqueId).collect(Collectors.toSet());
LabelManager newLabelManager = getLabelManager().getLabelManagerForExport(retainedDocIDs, export_mode);
setLabelManager(newLabelManager);
// copy the index; for public (discovery) mode, also redact the body and remove the title fields
final boolean redact_body_instead_of_remove = true;
Set<String> docIdSet = new LinkedHashSet<>();
for (Document d : allDocs) docIdSet.add(d.getUniqueId());
final Set<String> retainedDocIds = docIdSet;
Indexer.FilterFunctor emailFilter = doc -> {
if (!retainedDocIds.contains(doc.get("docId")))
return false;
if (exportInPublicMode) {
String text;
if (redact_body_instead_of_remove) {
text = doc.get("body");
}
doc.removeFields("body");
doc.removeFields("body_original");
if (text != null) {
String redacted_text = IndexUtils.retainOnlyNames(text, doc);
doc.add(new Field("body", redacted_text, Indexer.full_ft));
// this uses the standard analyzer, not stemming, because redacted bodies contain only names.
}
String title = doc.get("title");
doc.removeFields("title");
if (title != null) {
String redacted_title = IndexUtils.retainOnlyNames(title, doc); // redact the title itself, not the body text
doc.add(new Field("title", redacted_title, Indexer.full_ft));
}
}
return true;
};
/*
Moving this to the end, after changing the baseDir of the archive, because the addressbook gets saved
after maskEmailDomain.
if (exportInPublicMode) {
List<Document> docs = this.getAllDocs();
List<EmailDocument> eds = new ArrayList<>();
for (Document doc : docs)
eds.add((EmailDocument) doc);
EmailUtils.maskEmailDomain(eds, this.addressBook);
}
*/
Indexer.FilterFunctor attachmentFilter = doc -> {
if (exportInPublicMode) {
return false;
}
String docId = doc.get("emailDocId");
if (docId == null) {
Integer di = Integer.parseInt(doc.get("docId"));
// don't want to print too many messages
if (di < 10)
log.error("Looks like this is an old archive, filtering all the attachments!!\n" + "Consider re-indexing with the latest version for a proper export.");
return false;
}
return retainedDocIds.contains(docId);
};
indexer.copyDirectoryWithDocFilter(out_dir, emailFilter, attachmentFilter);
log.info("Completed exporting indexes");
// save the blobs in a new blobstore
if (!exportInPublicMode) {
log.info("Starting to export blobs, old blob store is: " + blobStore);
Set<Blob> blobsToKeep = new LinkedHashSet<>();
for (Document d : allDocs) if (d instanceof EmailDocument)
if (!Util.nullOrEmpty(((EmailDocument) d).attachments))
blobsToKeep.addAll(((EmailDocument) d).attachments);
String blobsDir = out_dir + File.separatorChar + BLOBS_SUBDIR;
new File(blobsDir).mkdirs();
BlobStore newBlobStore = blobStore.createCopy(blobsDir, blobsToKeep);
log.info("Completed exporting blobs, newBlobStore in dir: " + blobsDir + " is: " + newBlobStore);
// switch to the new blob store (important -- the urls and indexes in the new blob store are different from the old one!)
blobStore = newBlobStore;
}
String oldBaseDir = baseDir;
// change base directory
setBaseDir(out_dir);
if (exportInPublicMode) {
List<Document> docs = this.getAllDocs();
List<EmailDocument> eds = new ArrayList<>();
for (Document doc : docs) eds.add((EmailDocument) doc);
EmailUtils.maskEmailDomain(eds, this.addressBook);
}
// write out the archive file
// save .session file.
SimpleSessions.saveArchive(out_dir, name, this);
log.info("Completed saving archive object");
// restore states
setBaseDir(oldBaseDir);
allDocs = savedAllDocs;
setLabelManager(oldLabelManager);
return out_dir;
}
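For context, a minimal sketch of how a caller might invoke this export; the archive instance, retained-document list, output path and session name below are assumptions, not taken from the source.
// hypothetical caller; 'archive' is an existing Archive instance
Collection<? extends Document> docsToKeep = archive.getAllDocs(); // assumption: retain everything
String exportedDir = archive.export(docsToKeep, Export_Mode.EXPORT_PROCESSING_TO_DISCOVERY, "/tmp/discovery-export", "default"); // hypothetical out_dir and name
if (exportedDir == null)
    log.warn("export did not produce an output directory");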
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class JSPHelper method preparedArchive.
/**
* creates a new archive and returns it
*/
public static Archive preparedArchive(HttpServletRequest request, String baseDir, List<String> extraOptions) throws IOException {
List<String> list = new ArrayList<>();
if (request != null) {
if ("yearly".equalsIgnoreCase(request.getParameter("period")))
list.add("-yearly");
if (request.getParameter("noattachments") != null)
list.add("-noattachments");
// filter params
if ("true".equalsIgnoreCase(request.getParameter("sentOnly")))
list.add("-sentOnly");
String str = request.getParameter("dateRange");
if (str != null && str.length() > 0) {
list.add("-date");
list.add(str);
}
String keywords = request.getParameter("keywords");
if (keywords != null && !keywords.equals("")) {
list.add("-keywords");
list.add(keywords);
}
String filter = request.getParameter("filter");
if (filter != null && !filter.equals("")) {
list.add("-filter");
list.add(filter);
}
// advanced options
if ("true".equalsIgnoreCase(request.getParameter("incrementalTFIDF")))
list.add("-incrementalTFIDF");
if ("true".equalsIgnoreCase(request.getParameter("NER")))
list.add("-NER");
if (!"true".equalsIgnoreCase(request.getParameter("allText")))
list.add("-noalltext");
if ("true".equalsIgnoreCase(request.getParameter("locationsOnly")))
list.add("-locationsOnly");
if ("true".equalsIgnoreCase(request.getParameter("orgsOnly")))
list.add("-orgsOnly");
if ("true".equalsIgnoreCase(request.getParameter("includeQuotedMessages")))
list.add("-includeQuotedMessages");
String subjWeight = request.getParameter("subjectWeight");
if (subjWeight != null) {
list.add("-subjectWeight");
list.add(subjWeight);
}
}
if (!Util.nullOrEmpty(extraOptions))
list.addAll(extraOptions);
String[] s = new String[list.size()];
list.toArray(s);
// careful about the ordering here.. first setup, then read indexer, then run it
Archive archive = Archive.createArchive();
BlobStore blobStore = JSPHelper.preparedBlobStore(baseDir);
archive.setup(baseDir, blobStore, s);
log.info("archive setup in " + baseDir);
return archive;
}
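A minimal sketch of calling this helper from servlet code; the base directory and the empty extra-options list are assumptions.
// hypothetical usage; 'request' is the current HttpServletRequest (it may also be null, per the checks above)
String archiveBaseDir = System.getProperty("user.home") + File.separator + "epadd-data"; // assumed location
Archive archive = JSPHelper.preparedArchive(request, archiveBaseDir, new ArrayList<>());
JSPHelper.log.info("prepared archive with blob store under " + archiveBaseDir);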
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class EmailRenderer method getAttachmentDetails.
/*
Method to extract some key details from an attachment that need to be displayed in a fancybox in the gallery feature.
@chinmay, can we get rid of these escapeHTML and escapeJSON and URLEncode?
*/
private static JsonObject getAttachmentDetails(Archive archive, Blob attachment, Document doc) {
// prepare json object of the information. The format is
// {index:'',href:'', from:'', date:'', subject:'',filename:'',downloadURL:'',tileThumbnailURL:'',msgURL:'',info:''}
// the info field is optional and present only for those attachments that were converted or normalized during ingestion, so the user needs to be notified about them.
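// For illustration only (all values below are hypothetical, not from the source), a returned object for a pdf attachment might look like:
// {"filename":"report.pdf","filenameWithIndex":"report-1.pdf","from":"Jane Doe <jdoe@example.org>","date":"Jan 5, 2005",
//  "subject":"Quarterly report","msgURL":"browse?archiveID=...&adv-search=1&attachmentFileWithNumber=report-1.pdf",
//  "href":"images/pdf_icon.svg","downloadURL":"serveAttachment.jsp?archiveID=...&file=...","tileThumbnailURL":"images/pdf_icon.svg","size":102400}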
JsonObject result = new JsonObject();
String archiveID = ArchiveReaderWriter.getArchiveIDForArchive(archive);
// Extract mail information
// Extract a few details like sender, date, and message body (ellipsized up to some length) and put them in result.
EmailDocument ed = (EmailDocument) doc;
// Problematic cases where the converted JSON object threw errors in JS.
/*
Case 1: {"from":"Δρ. Θεόδωρος Σίμος \r\n\t- Dr. ***** (redacted)","date":"Jan 5, 2005"}
Solution: escapeJson will escape these to \\r\\n\\t.
Case 2: {"from":"李升荧","date":"Dec 19, 2012","subject":"shangwu@jxdyf.com,Please find
."}
Problem: There is a newline in the subject between "find" and ".".
Solution: escapeJson will escape that newline, which makes the string parse correctly.
*/
// escaping because the sender string may contain something like <jbush@..>.
String sender = Util.escapeHTML(ed.getFromString());
sender = Util.escapeJSON(sender);
String date = Util.escapeHTML(ed.dateString());
String subject = Util.escapeHTML(ed.description);
subject = Util.escapeJSON(subject);
String docId = ed.getUniqueId();
// for the caption of the attachment
BlobStore attachmentStore = archive.getBlobStore();
String filename = attachmentStore.full_filename_normalized(attachment, false);
// IMP: We want to open set of all those messages which have this attachment. Therefore we don't use docID to open the message.
// String messageURL = "browse?archiveID="+archiveID+"&docId=" + docId;
// Use browse?archiveID=...&adv-search=1&attachmentFilename= as the msgurl.
result.addProperty("filename", Util.escapeHTML(filename));
String numberedFileName = attachmentStore.full_filename_normalized(attachment, true);
String messageURL = "browse?archiveID=" + archiveID + "&adv-search=1&attachmentFileWithNumber=" + Util.URLEncode(numberedFileName);
result.addProperty("filenameWithIndex", numberedFileName);
result.addProperty("from", sender);
if (ed.hackyDate)
result.addProperty("date", "Undated");
else
result.addProperty("date", date);
result.addProperty("subject", subject);
result.addProperty("msgURL", messageURL);
// Extract a few details like the attachment src, thumbnail, search-for-message URL etc. and put them in result.
// tilethumbnailURL is the url of the image displayed on small tile in the gallery landing page
// thumbnailURL is the url of the image displayed in the gallery mode (inside fancybox). For now both are same but they can be made different
// later therefore the distinction here.
String thumbnailURL = null, downloadURL = null, tileThumbnailURL = null;
if (attachmentStore != null) {
String contentFileDataStoreURL = attachmentStore.get_URL_Normalized(attachment);
// IMP: We need to do URLEncode otherwise if filename contains (') then the object creation from json data fails in the frontend.
// e.g. if the file's name is Jim's
downloadURL = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLEncode(Util.URLtail(contentFileDataStoreURL));
String tnFileDataStoreURL = attachmentStore.getViewURL(attachment, "tn");
if (tnFileDataStoreURL != null) {
thumbnailURL = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLEncode(Util.URLtail(tnFileDataStoreURL));
// set the tile's thumbnail (on the gallery landing page) to the same URL.
tileThumbnailURL = thumbnailURL;
} else {
if (archive.getBlobStore().is_image(attachment)) {
// maybe wait for the day when both Chrome and Firefox start supporting them (TIFFs are not previewable in browsers today).
if (Util.getExtension(contentFileDataStoreURL).equals("tif")) {
// handle it like non-previewable file.
thumbnailURL = "images/tiff_icon.svg";
tileThumbnailURL = "images/tiff_icon.svg";
} else {
thumbnailURL = downloadURL;
// maybe we need to reduce its size. @TODO
tileThumbnailURL = thumbnailURL;
}
} else if (Util.is_pdf_filename(contentFileDataStoreURL)) {
// pdfs are also treated as docs, so better to keep this check first.
// thumbnailURL of a pdf can be a pdf image @TODO
thumbnailURL = "images/pdf_icon.svg";
tileThumbnailURL = "images/pdf_icon.svg";
} else if (Util.is_ppt_filename(contentFileDataStoreURL)) {
// same for ppt
// thumbnailURL of a ppt can be a ppt image @TODO
thumbnailURL = "images/ppt_icon.svg";
tileThumbnailURL = "images/ppt_icon.svg";
} else if (Util.is_doc_filename(contentFileDataStoreURL)) {
// thumbnailURL of a doc can be a doc image @TODO
thumbnailURL = "images/doc_icon.svg";
tileThumbnailURL = "images/doc_icon.svg";
} else if (Util.is_zip_filename(contentFileDataStoreURL)) {
// thumbnailURL of a zip can be a zip image @TODO
thumbnailURL = "images/zip_icon.svg";
tileThumbnailURL = "images/zip_icon.svg";
} else {
thumbnailURL = "images/large_sorry_img.svg";
tileThumbnailURL = "images/large_sorry_img.svg";
}
}
} else
JSPHelper.log.warn("attachments store is null!");
if (thumbnailURL == null)
thumbnailURL = "images/large_sorry_img.svg";
// downloadURL should never be null.
boolean isNormalized = attachmentStore.isNormalized(attachment);
boolean isCleanedName = attachmentStore.isCleaned(attachment);
String cleanupurl = attachmentStore.get_URL_Cleanedup(attachment);
String info = "";
if (isNormalized || isCleanedName) {
String completeurl_cleanup = "serveAttachment.jsp?archiveID=" + archiveID + "&file=" + Util.URLEncode(Util.URLtail(cleanupurl));
if (isNormalized) {
info = "This file was converted during the preservation process. Its original name was " + attachmentStore.full_filename_original(attachment, false) + ". Click <a href=" + completeurl_cleanup + ">here </a> to download the original file";
} else if (isCleanedName) {
info = "This file name was cleaned up during the preservation process. The original file name was " + attachmentStore.full_filename_original(attachment, false);
}
}
// {index:'',href:'', from:'', date:'', subject:'',filename:'',downloadURL:'',tileThumbnailURL:'',info:'',size:''}
result.addProperty("size", attachment.size);
result.addProperty("href", thumbnailURL);
result.addProperty("downloadURL", downloadURL);
result.addProperty("tileThumbnailURL", tileThumbnailURL);
// add this field only if it is non-empty (that is the beauty of json: a non-fixed structure for the data)
if (!Util.nullOrEmpty(info))
result.addProperty("info", info);
return result;
}
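A hedged sketch of how this private helper might be driven from elsewhere in EmailRenderer to build a gallery payload; the surrounding 'archive' and 'ed' variables are assumptions, and JsonArray is assumed to be Gson's, on the classpath alongside JsonObject.
// hypothetical caller; 'ed' is an EmailDocument whose attachments should be shown in the gallery
JsonArray gallery = new JsonArray();
if (!Util.nullOrEmpty(ed.attachments))
    for (Blob attachment : ed.attachments)
        gallery.add(getAttachmentDetails(archive, attachment, ed));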
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class JSPHelper method preparedBlobStore.
/*
* creates a new blob store object from the given location (may already
* exist) and returns it
*/
private static BlobStore preparedBlobStore(String baseDir) throws IOException {
// always set up attachmentsStore even if we are not fetching attachments
// because the user may already have stuff in it -- if so, we should make it available.
String attachmentsStoreDir = baseDir + File.separatorChar + Archive.BLOBS_SUBDIR + File.separator;
BlobStore attachmentsStore;
try {
File f = new File(attachmentsStoreDir);
// the return value is not relevant
f.mkdirs();
if (!f.exists() || !f.isDirectory() || !f.canWrite())
throw new IOException("Unable to create directory for writing: " + attachmentsStoreDir);
attachmentsStore = new BlobStore(attachmentsStoreDir);
} catch (IOException ioe) {
log.error("MAJOR ERROR: Disabling attachments because unable to initialize attachments store in directory: " + attachmentsStoreDir + " :" + ioe + " " + Util.stackTrace(ioe));
attachmentsStore = null;
throw (ioe);
}
return attachmentsStore;
}
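Since this helper is private, external callers would normally go through preparedArchive above; a stand-alone sketch, assuming only the BlobStore constructor and directory layout seen here, with a hypothetical base directory.
// hypothetical stand-alone setup of a blob store under an archive's base directory
String dir = "/tmp/epadd-archive" + File.separatorChar + Archive.BLOBS_SUBDIR + File.separator; // assumed path
new File(dir).mkdirs();
BlobStore store = new BlobStore(dir); // throws IOException if the directory cannot be used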
use of edu.stanford.muse.datacache.BlobStore in project epadd by ePADD.
the class MuseEmailFetcher method fetchAndIndexEmails.
/**
* Key method to fetch actual email messages. Can take a long time.
* @param selectedFolders is in the format <account name>^-^<folder name>
* @param session is used only to put a status object in; can be null, in which case no status object is set
* Results are reflected in the archive's emailDocs, addressBook and blobstore.
* @throws NoDefaultFolderException
*/
public void fetchAndIndexEmails(Archive archive, String[] selectedFolders, boolean useDefaultFolders, FetchConfig fetchConfig, HttpSession session, Consumer<StatusProvider> setStatusProvider) throws InterruptedException, JSONException, NoDefaultFolderException, CancelledException {
setupFetchers(-1);
long startTime = System.currentTimeMillis();
setStatusProvider.accept(new StaticStatusProvider("Starting to process messages..."));
// if (session != null)
// session.setAttribute("statusProvider", new StaticStatusProvider("Starting to process messages..."));
boolean op_cancelled = false, out_of_mem = false;
BlobStore attachmentsStore = archive.getBlobStore();
fetchConfig.downloadAttachments = fetchConfig.downloadAttachments && attachmentsStore != null;
if (Util.nullOrEmpty(fetchers)) {
log.warn("Trying to fetch email with no fetchers, setup not called ?");
return;
}
setupFoldersForFetchers(fetchers, selectedFolders, useDefaultFolders);
List<FolderInfo> fetchedFolderInfos = new ArrayList<>();
// one fetcher will aggregate everything
FetchStats stats = new FetchStats();
MTEmailFetcher aggregatingFetcher = null;
// a fetcher is one source, like an account or a top-level mbox dir. A fetcher could include multiple folders.
long startTimeMillis = System.currentTimeMillis();
for (MTEmailFetcher fetcher : fetchers) {
// in theory, different iterations of this loop could be run in parallel ("archive" access will be synchronized)
setStatusProvider.accept(fetcher);
/*if (session != null)
session.setAttribute("statusProvider", fetcher);
*/
fetcher.setArchive(archive);
fetcher.setFetchConfig(fetchConfig);
log.info("Memory status before fetching emails: " + Util.getMemoryStats());
// this is the big call, can run for a long time. Note: running in the same thread, it's not fetcher.start();
List<FolderInfo> foldersFetchedByThisFetcher = fetcher.run();
// but don't abort immediately, only at the end, after addressbook has been built for at least the processed messages
if (fetcher.isCancelled()) {
log.info("NOTE: fetcher operation was cancelled");
op_cancelled = true;
break;
}
if (fetcher.mayHaveRunOutOfMemory()) {
log.warn("Fetcher operation ran out of memory " + fetcher);
out_of_mem = true;
break;
}
fetchedFolderInfos.addAll(foldersFetchedByThisFetcher);
if (aggregatingFetcher == null && !Util.nullOrEmpty(foldersFetchedByThisFetcher))
// first non-empty fetcher
aggregatingFetcher = fetcher;
if (aggregatingFetcher != null)
aggregatingFetcher.merge(fetcher);
// add the indexed folders to the stats
EmailStore store = fetcher.getStore();
String fetcherDescription = store.displayName + ":" + store.emailAddress;
for (FolderInfo fi : fetchedFolderInfos) stats.selectedFolders.add(new Pair<>(fetcherDescription, fi));
}
if (op_cancelled)
throw new CancelledException();
if (out_of_mem)
throw new OutOfMemoryError();
if (aggregatingFetcher != null) {
stats.importStats = aggregatingFetcher.stats;
if (aggregatingFetcher.mayHaveRunOutOfMemory())
throw new OutOfMemoryError();
}
// save memory
aggregatingFetcher = null;
long endTimeMillis = System.currentTimeMillis();
long elapsedMillis = endTimeMillis - startTimeMillis;
log.info(elapsedMillis + " ms for fetch+index, Memory status: " + Util.getMemoryStats());
// note: this is all archive docs, not just the ones that may have been just imported
List<EmailDocument> allEmailDocs = (List) archive.getAllDocs();
archive.addFetchedFolderInfos(fetchedFolderInfos);
if (allEmailDocs.size() == 0)
log.warn("0 messages from email fetcher");
// EmailUtils.cleanDates(allEmailDocs);
// create a new address book
// if (session != null)
// session.setAttribute("statusProvider", new StaticStatusProvider("Building address book..."));
setStatusProvider.accept(new StaticStatusProvider("Building address book..."));
AddressBook addressBook = EmailDocument.buildAddressBook(allEmailDocs, archive.ownerEmailAddrs, archive.ownerNames);
log.info("Address book created!!");
log.info("Address book stats: " + addressBook.getStats());
// if (session != null)
// session.setAttribute("statusProvider", new StaticStatusProvider("Finishing up..."));
setStatusProvider.accept(new StaticStatusProvider("Finishing up..."));
archive.setAddressBook(addressBook);
// we shouldn't really have dups now because the archive ensures that only unique docs are added
// move sorting to archive.postprocess?
EmailUtils.removeDupsAndSort(allEmailDocs);
// report stats
stats.lastUpdate = new Date().getTime();
// For issue #254.
stats.archiveOwnerInput = name;
stats.archiveTitleInput = archiveTitle;
stats.primaryEmailInput = alternateEmailAddrs;
stats.emailSourcesInput = emailSources;
// ////
// (String) JSPHelper.getSessionAttribute(session, "userKey");
stats.userKey = "USER KEY UNUSED";
stats.fetchAndIndexTimeMillis = elapsedMillis;
updateStats(archive, addressBook, stats);
// if (session != null)
// session.removeAttribute("statusProvider");
log.info("Fetch+index complete: " + Util.commatize(System.currentTimeMillis() - startTime) + " ms");
}
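A hedged sketch of invoking this method; the no-argument constructors, the FetchConfig field access and the folder spec are assumptions beyond what the snippet itself shows, and a real caller must also handle the declared checked exceptions.
// hypothetical invocation; assumes the fetcher's email sources were configured earlier (setupFetchers is called internally)
MuseEmailFetcher muse = new MuseEmailFetcher(); // assumed default constructor
FetchConfig fc = new FetchConfig(); // assumed default constructor
fc.downloadAttachments = true; // field referenced above; effective only if the archive has a blob store
String[] folders = { "account1^-^INBOX" }; // format per the javadoc: <account name>^-^<folder name>
muse.fetchAndIndexEmails(archive, folders, false, fc, null, sp -> { /* no-op status consumer */ });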