use of edu.stanford.muse.util.Pair in project epadd by ePADD.
the class SearchResult method filterForAttachmentNames.
/**
 * Keeps the documents that own at least one attachment whose blob-store URL tail is listed
 * under the "attachment" query parameter.
 * This is more specific than the attachmentFilename filter, which only matches the real
 * filename: here the match is on a specific attachment, including its numeric blobstore prefix.
 * Used when finding the message(s) belonging to the image wall.
 *
 * A retained document keeps its existing highlight info; the matching blobs are added to its
 * AttachmentHLInfo, so previously selected attachments stay selected alongside the new ones.
 *
 * @param inputSet search result to filter; its "attachment" query parameter holds the wanted URL tails
 * @return {@code inputSet} itself when the "attachment" parameter is null or empty, otherwise a new
 *         SearchResult holding only the documents with at least one matching attachment
 */
private static SearchResult filterForAttachmentNames(SearchResult inputSet) {
    Collection<String> requestedTails = inputSet.queryParams.get("attachment");
    if (Util.nullOrEmpty(requestedTails)) {
        return inputSet;
    }

    Set<String> wantedTails = new LinkedHashSet<>(requestedTails);
    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> retained = new HashMap<>();

    for (Document doc : inputSet.matchedDocs.keySet()) {
        EmailDocument email = (EmailDocument) doc;

        // collect the attachments of this message whose URL tail was asked for
        Set<Blob> hits = new HashSet<>();
        for (Blob blob : email.attachments) {
            String tail = Util.URLtail(inputSet.archive.blobStore.getRelativeURL(blob));
            if (wantedTails.contains(tail)) {
                hits.add(blob);
            }
        }
        if (hits.isEmpty()) {
            continue;
        }

        // carry over the highlight info already attached to this document and extend it
        Pair<BodyHLInfo, AttachmentHLInfo> current = inputSet.matchedDocs.get(doc);
        BodyHLInfo bodyInfo = current.first;
        AttachmentHLInfo attachmentInfo = current.second;
        attachmentInfo.addMultipleInfo(hits);
        retained.put(doc, new Pair<>(bodyInfo, attachmentInfo));
    }

    return new SearchResult(retained, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
use of edu.stanford.muse.util.Pair in project epadd by ePADD.
the class SearchResult method selectBlobs.
/**
 * Selects the attachments of the date-filtered documents that pass the file size and
 * extension/type filters given in the query parameters.
 * This is used only by the attachments page right now, not advanced search.
 * TODO: make adv. search page also use it
 *
 * <p>Query parameters read: "attachmentFilesize", "attachmentExtension" (explicit extensions) and
 * "attachmentType". Each attachment type is first looked up in
 * {@code Config.attachmentTypeToExtensions}; when there is no mapping the value itself is treated
 * as a semicolon-separated extension list (e.g. {@code mp3;mov;ogg}), so both styles of front-end
 * input are accepted. All extensions are trimmed and lower-cased before matching, because the
 * attachment's own extension is lower-cased before the comparison.
 *
 * <p>If the special extension "others" is requested, attachments whose extension is not among the
 * extensions configured in {@code Config.attachmentTypeToExtensions} are accepted as well.
 *
 * @param inputSet search result whose archive, query parameters and matched documents are used
 * @return a new SearchResult with the documents that have at least one surviving attachment; the
 *         surviving blobs are added to each document's existing AttachmentHLInfo
 */
public static SearchResult selectBlobs(SearchResult inputSet) {
    Archive archive = inputSet.archive;
    String neededFilesize = JSPHelper.getParam(inputSet.queryParams, "attachmentFilesize");

    // explicitly requested extensions: lower-case strings, no "." included
    Set<String> extensionsToMatch = new LinkedHashSet<>();
    for (String extension : JSPHelper.getParams(inputSet.queryParams, "attachmentExtension")) {
        extensionsToMatch.add(extension.trim().toLowerCase());
    }

    // OR the given extensions with the extensions implied by the attachment types
    for (String type : JSPHelper.getParams(inputSet.queryParams, "attachmentType")) {
        String exts = Config.attachmentTypeToExtensions.get(type);
        if (exts == null) {
            // no mapping: the front end passed the extensions themselves, e.g. "mp3;mov;ogg"
            exts = type;
        }
        for (String component : exts.split(";")) {
            // normalise like the explicit extensions, otherwise mixed-case entries never match
            extensionsToMatch.add(component.trim().toLowerCase());
        }
    }

    // whether the user asked for the catch-all 'other' option
    boolean isOtherSelected = extensionsToMatch.contains("others");
    // every extension offered through the attachment type options; anything outside this set
    // counts as "other". Kept as a set because it is probed once per attachment.
    Set<String> attachmentTypeOptions = Config.attachmentTypeToExtensions.values().stream()
            .map(x -> Util.tokenize(x, ";"))
            .flatMap(Collection::stream)
            .collect(Collectors.toSet());

    SearchResult outputSet = filterDocsByDate(inputSet);
    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> outputDocs = new HashMap<>();
    for (Document k : outputSet.matchedDocs.keySet()) {
        EmailDocument ed = (EmailDocument) k;
        Set<Blob> matchedBlobs = new HashSet<>();
        for (Blob b : ed.attachments) {
            if (!Util.filesizeCheck(neededFilesize, b.getSize())) {
                continue;
            }
            if (!Util.nullOrEmpty(extensionsToMatch)) {
                Pair<String, String> pair = Util.splitIntoFileBaseAndExtension(archive.getBlobStore().get_URL_Normalized(b));
                String ext = pair.getSecond();
                if (ext == null) {
                    continue;
                }
                ext = ext.toLowerCase();
                // Keep this attachment only if either
                // 1. 'others' is selected and this extension is not one of the known type extensions, or
                // 2. this extension was requested [Q. What if there is a file with extension .others?]
                boolean acceptedAsOther = isOtherSelected && !attachmentTypeOptions.contains(ext);
                boolean acceptedAsRequested = extensionsToMatch.contains(ext);
                if (!acceptedAsOther && !acceptedAsRequested) {
                    continue;
                }
            }
            // ok, we've survived all filters, add b
            matchedBlobs.add(b);
        }
        // keep the document only if at least one of its attachments survived
        if (!matchedBlobs.isEmpty()) {
            // NOTE(review): highlight info is read from inputSet although the keys come from the
            // date-filtered set; this assumes filterDocsByDate only returns documents of inputSet.
            Pair<BodyHLInfo, AttachmentHLInfo> existing = inputSet.matchedDocs.get(k);
            BodyHLInfo bhlinfo = existing.first;
            AttachmentHLInfo attachmentHLInfo = existing.second;
            attachmentHLInfo.addMultipleInfo(matchedBlobs);
            outputDocs.put(k, new Pair<>(bhlinfo, attachmentHLInfo));
        }
    }
    return new SearchResult(outputDocs, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
use of edu.stanford.muse.util.Pair in project epadd by ePADD.
the class SearchResult method filterForAttachments.
/**
 * Looks in the given docs for messages with an attachment that satisfies all the attachment
 * requirements present in the query parameters, and returns those messages along with the
 * matching blobs.
 *
 * <p>Query parameters read:
 * <ul>
 * <li>"attachmentFilename" - semicolon-separated (OR) list of name fragments, matched
 *     case-insensitively as substrings; when "attachmentFilenameRegex" is "on" the whole value is
 *     used as a regular expression and a partial match ({@code find}) is enough.</li>
 * <li>"attachmentFileWithNumber" - OR list of numbered file names. ePADD puts a unique number in
 *     front of a file name such as image00.png to tell apart files with the same name, so this
 *     selects one specific attachment. It is applied whether or not regex mode is on.</li>
 * <li>"attachmentType" / "attachmentExtension" - wanted extensions. When neither is given, every
 *     extension in {@code Config.attachmentTypeToExtensions} is accepted. A file without an
 *     extension is treated as having the extension "unidentified". If "others" is requested,
 *     extensions outside the configured type options are accepted too.</li>
 * <li>"attachmentFilesize" - checked with {@code Util.filesizeCheck}.</li>
 * </ul>
 *
 * @param inputSet search result to filter
 * @return {@code inputSet} itself if none of the filename, numbered filename, type, extension or
 *         file size parameters is set; otherwise a new SearchResult with the documents that have
 *         at least one matching attachment, whose matching blobs were added to the document's
 *         existing AttachmentHLInfo
 * @throws java.util.regex.PatternSyntaxException if regex mode is on and the file name is not a
 *         valid regular expression
 */
private static SearchResult filterForAttachments(SearchResult inputSet) {
    Archive archive = inputSet.archive;
    String neededFilesize = JSPHelper.getParam(inputSet.queryParams, "attachmentFilesize");
    String neededFilename = JSPHelper.getParam(inputSet.queryParams, "attachmentFilename");
    // this can come in as a single parameter with multiple values (in case of multiple selections by the user)
    Collection<String> neededTypeStr = JSPHelper.getParams(inputSet.queryParams, "attachmentType");
    String neededExtensionStr = JSPHelper.getParam(inputSet.queryParams, "attachmentExtension");
    String numberedFileNames = JSPHelper.getParam(inputSet.queryParams, "attachmentFileWithNumber");
    if (Util.nullOrEmpty(numberedFileNames) && Util.nullOrEmpty(neededFilesize) && Util.nullOrEmpty(neededFilename)
            && Util.nullOrEmpty(neededTypeStr) && Util.nullOrEmpty(neededExtensionStr)) {
        return inputSet;
    }

    // set up the file names, or the regex pattern if regex mode is on
    String neededFilenameRegex = JSPHelper.getParam(inputSet.queryParams, "attachmentFilenameRegex");
    Set<String> neededFilenames = null;
    Pattern filenameRegexPattern = null;
    if (!Util.nullOrEmpty(neededFilename)) {
        if ("on".equals(neededFilenameRegex)) {
            filenameRegexPattern = Pattern.compile(neededFilename);
        } else {
            // will be in lower case
            neededFilenames = Util.splitFieldForOr(neededFilename);
        }
    }

    // parse numberedFileNames if present (will be in lower case)
    Set<String> neededNumberedFilenames = null;
    if (!Util.nullOrEmpty(numberedFileNames)) {
        neededNumberedFilenames = Util.splitFieldForOr(numberedFileNames);
    }

    // set up the extensions (will be in lower case)
    Set<String> neededExtensions = new LinkedHashSet<>();
    if (!Util.nullOrEmpty(neededTypeStr) || !Util.nullOrEmpty(neededExtensionStr)) {
        // compile the list of all extensions from type (audio/video, etc) and explicitly provided extensions
        if (!Util.nullOrEmpty(neededTypeStr)) {
            // each value is something like "mp3;ogg"; the multiselect picker delivers one value per type
            for (String s : neededTypeStr) {
                neededExtensions.addAll(Util.splitFieldForOr(s));
            }
        }
        if (!Util.nullOrEmpty(neededExtensionStr)) {
            neededExtensions.addAll(Util.splitFieldForOr(neededExtensionStr));
        }
    } else {
        // neither type nor extension given: accept every configured extension/type
        for (String s : Config.attachmentTypeToExtensions.values()) {
            neededExtensions.addAll(Util.splitFieldForOr(s));
        }
    }

    // whether the catch-all 'other' option was selected by the user
    boolean isOtherSelected = neededExtensions.contains("others");
    // the extensions offered through the attachment type options; anything outside counts as
    // "other". Computed once here because it does not depend on the attachment being checked.
    Set<String> attachmentTypeOptions = Config.attachmentTypeToExtensions.values().stream()
            .map(x -> Util.tokenize(x, ";"))
            .flatMap(Collection::stream)
            .collect(Collectors.toSet());

    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> outputDocs = new HashMap<>();
    for (Document k : inputSet.matchedDocs.keySet()) {
        EmailDocument ed = (EmailDocument) k;
        Set<Blob> matchedBlobs = new HashSet<>();
        for (Blob b : ed.attachments) {
            // An attachment must pass every check below; the first failing one skips to the next blob.
            String url = archive.getBlobStore().full_filename_normalized(b, false);
            if (url == null) {
                // no name: it cannot satisfy the extension check, so it effectively doesn't match
                continue;
            }

            // 1a. filename matches?
            if (filenameRegexPattern != null) {
                // use find rather than matches because we want partial match on the filename
                if (!filenameRegexPattern.matcher(url).find()) {
                    continue;
                }
            } else if (neededFilenames != null) {
                // OR semantics: any one of the lower-cased fragments may occur in the name
                String lowerCaseUrl = url.toLowerCase();
                if (neededFilenames.stream().noneMatch(lowerCaseUrl::contains)) {
                    continue;
                }
            }

            // 1b. numbered filename (name with its unique number prefix) is one of the requested ones?
            if (neededNumberedFilenames != null) {
                String numberedUrl = archive.getBlobStore().full_filename_normalized(b, true);
                if (numberedUrl == null || !neededNumberedFilenames.contains(numberedUrl.toLowerCase())) {
                    continue;
                }
            }

            // 2. extension matches?
            String extension = Util.getExtension(url);
            if (extension == null) {
                // no extension: match only if the needed extension is the "unidentified" type
                extension = "Unidentified";
            }
            extension = extension.toLowerCase();
            // Keep this attachment only if either
            // 1. 'others' is selected and this extension is not one of the type options, or
            // 2. this extension is requested [Q. What if there is a file with extension .others?]
            boolean acceptedAsOther = isOtherSelected && !attachmentTypeOptions.contains(extension);
            boolean acceptedAsRequested = neededExtensions.contains(extension);
            if (!acceptedAsOther && !acceptedAsRequested) {
                continue;
            }

            // 3. size matches?
            if (!Util.filesizeCheck(neededFilesize, b.getSize())) {
                continue;
            }

            // if we reached here, all conditions are satisfied
            matchedBlobs.add(b);
        }
        // keep the document only if at least one of its attachments matched
        if (!matchedBlobs.isEmpty()) {
            Pair<BodyHLInfo, AttachmentHLInfo> existing = inputSet.matchedDocs.get(k);
            BodyHLInfo bhlinfo = existing.first;
            AttachmentHLInfo attachmentHLInfo = existing.second;
            attachmentHLInfo.addMultipleInfo(matchedBlobs);
            outputDocs.put(k, new Pair<>(bhlinfo, attachmentHLInfo));
        }
    }
    return new SearchResult(outputDocs, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
use of edu.stanford.muse.util.Pair in project epadd by ePADD.
the class SearchResult method searchForTerm.
/**
 * Returns a SearchResult containing the docs and attachments matching the given term.
 *
 * <p>The message text is queried according to the first of the query parameters "termBody",
 * "termOriginalBody" and "termSubject" that is "on" (body and original body are deliberately
 * never both searched). If "termAttachments" is "on", attachments are searched as well and a new
 * SearchResult is built; otherwise the matched documents of {@code inputSet} are narrowed in
 * place and {@code inputSet} itself is returned. In both cases the term is added to the common
 * highlighting info of the returned result.
 *
 * @param inputSet Input search result object on which this term filtering needs to be done
 * @param term term to search for
 * @return searchresult obj
 */
public static SearchResult searchForTerm(SearchResult inputSet, String term) {
    // pick the text query type; the first switch that is "on" wins
    Indexer.QueryType textQueryType = null;
    if ("on".equals(JSPHelper.getParam(inputSet.queryParams, "termBody"))) {
        textQueryType = Indexer.QueryType.FULL;
    } else if ("on".equals(JSPHelper.getParam(inputSet.queryParams, "termOriginalBody"))) {
        // an else because we don't want to look at both body and body original
        textQueryType = Indexer.QueryType.ORIGINAL;
    } else if ("on".equals(JSPHelper.getParam(inputSet.queryParams, "termSubject"))) {
        textQueryType = Indexer.QueryType.SUBJECT;
    }

    Set<Document> docsForTerm = new LinkedHashSet<>();
    if (textQueryType != null) {
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setQueryType(textQueryType);
        docsForTerm.addAll(inputSet.archive.docsForQuery(term, options));
    }

    SearchResult outputSet;
    if (!"on".equals(JSPHelper.getParam(inputSet.queryParams, "termAttachments"))) {
        // attachments are not searched: just retain the documents whose text matched
        inputSet.matchedDocs.keySet().retainAll(docsForTerm);
        outputSet = inputSet;
    } else {
        Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> attachmentSearchResult = new HashMap<>();
        Set<Blob> blobsForTerm = inputSet.archive.blobsForQuery(term);
        // go over all attachments of the docs present in inputSet
        for (Document doc : inputSet.matchedDocs.keySet()) {
            EmailDocument email = (EmailDocument) doc;
            Set<Blob> commonAttachments = new HashSet<>(email.attachments);
            commonAttachments.retainAll(blobsForTerm);

            boolean termInAttachments = !commonAttachments.isEmpty();
            boolean termInText = docsForTerm.contains(doc);
            if (!termInAttachments && !termInText) {
                // the term occurs neither in the text nor in any attachment: drop the document
                continue;
            }

            Pair<BodyHLInfo, AttachmentHLInfo> existing = inputSet.matchedDocs.get(doc);
            BodyHLInfo bodyInfo = existing.first;
            AttachmentHLInfo attachmentInfo = existing.second;
            if (termInText) {
                // the text matched: record the term in the body highlighter
                bodyInfo.addTerm(term);
            }
            if (termInAttachments) {
                // attachments matched: record them in the attachment highlighter
                attachmentInfo.addMultipleInfo(commonAttachments);
            }
            attachmentSearchResult.put(doc, new Pair<>(bodyInfo, attachmentInfo));
        }
        outputSet = new SearchResult(attachmentSearchResult, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
    }

    // Add term to common highlighting info (as it is, without parsing) for highlighting.
    // The term will be in lucene syntax (OR, AND etc.);
    // lucene highlighter will take care of highlighting that.
    outputSet.commonHLInfo.addTerm(term);
    return outputSet;
}
use of edu.stanford.muse.util.Pair in project epadd by ePADD.
the class SearchResult method filterForAttachmentEntities.
/**
 ******************************ATTACHMENT SPECIFIC FILTERS************************************
 */
/**
 * Returns only those docs with attachments matching params[attachmentEntity]
 * (this field is or-delimiter separated).
 * An attachment matches when at least one of the keywords the blob store reports for it is one
 * of the requested entities; the requested value is lower-cased before it is split. The keyword
 * collection handed out by the blob store is only read, never modified.
 * Todo: review usage of this and BlobStore.getKeywordsForBlob()
 *
 * @param inputSet search result to filter
 * @return {@code inputSet} itself when the "attachmentEntity" parameter is null or empty, otherwise
 *         a new SearchResult with the documents that have at least one matching attachment; the
 *         matching blobs are added to each document's existing AttachmentHLInfo
 */
private static SearchResult filterForAttachmentEntities(SearchResult inputSet) {
    String val = JSPHelper.getParam(inputSet.queryParams, "attachmentEntity");
    if (Util.nullOrEmpty(val)) {
        return inputSet;
    }
    Set<String> entities = Util.splitFieldForOr(val.toLowerCase());
    BlobStore blobStore = inputSet.archive.blobStore;
    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> outputDocs = new HashMap<>();
    for (Document k : inputSet.matchedDocs.keySet()) {
        EmailDocument ed = (EmailDocument) k;
        // check all attachments of ed for a match
        Set<Blob> matchedBlobs = new HashSet<>();
        for (Blob blob : ed.attachments) {
            Collection<String> keywords = blobStore.getKeywordsForBlob(blob);
            // disjoint() only reads both collections; retainAll() on the returned keywords would
            // alter (or fail on) whatever collection the blob store handed out
            if (keywords != null && !Collections.disjoint(keywords, entities)) {
                // this blob is of interest
                matchedBlobs.add(blob);
            }
        }
        // keep the document only if at least one of its attachments matched
        if (!matchedBlobs.isEmpty()) {
            Pair<BodyHLInfo, AttachmentHLInfo> existing = inputSet.matchedDocs.get(k);
            BodyHLInfo bhlinfo = existing.first;
            AttachmentHLInfo attachmentHLInfo = existing.second;
            attachmentHLInfo.addMultipleInfo(matchedBlobs);
            outputDocs.put(k, new Pair<>(bhlinfo, attachmentHLInfo));
        }
    }
    return new SearchResult(outputDocs, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
Aggregations