Search in sources :

Example 6 with Blob

use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.

the class EmailFetcherThread method handleAttachments.

/**
 * Processes a single MIME part of a message as an attachment, fetching and saving it if needed.
 *
 * <p>The part {@code p} is normally wrapped in a {@link Blob}, stored in the archive's blob store
 * (when {@code fetchConfig.downloadAttachments} is set) and appended to {@code attachmentsList}.
 * The one exception is a {@code text/html} part without a filename: its extracted text is appended
 * to {@code textList} instead, and the raw HTML string is returned so the caller can save it as
 * part of the email.
 *
 * @param ed              the email document the part belongs to (used for labelling parse problems)
 * @param idx             message number within the folder, used in diagnostics and generated names
 * @param m               the enclosing message; must be a {@code MimeMessage}
 * @param p               the part to process
 * @param textList        receives extracted text for filename-less text/html parts
 * @param attachmentsList receives the blob for a successfully handled attachment
 * @return the HTML string if the part was a filename-less text/html part, otherwise {@code null}
 * @throws MessagingException if the part or message cannot be read
 */
private String handleAttachments(EmailDocument ed, int idx, Message m, Part p, List<String> textList, List<Blob> attachmentsList) throws MessagingException {
    String ct = null;
    if (!(m instanceof MimeMessage)) {
        Exception e = new IllegalArgumentException("Not a MIME message!");
        e.fillInStackTrace();
        log.warn(Util.stackTrace(e));
        return null;
    }
    String filename = null;
    try {
        filename = p.getFileName();
    } catch (Exception e) {
        // seen this happen with:
        // Folders__gmail-sent Message #12185 Expected ';', got "Message"
        // javax.mail.internet.ParseException: Expected ';', got "Message"
        dataErrors.add("Unable to read attachment name: " + folder_name() + " Message# " + idx);
        Set<String> label = new LinkedHashSet<>();
        label.add(LabelManager.LABELID_ATTCH_ERRS);
        archive.getLabelManager().setLabels(ed.getUniqueId(), label);
        return null;
    }
    String sanitizedFName = Util.sanitizeFolderName(emailStore.getAccountID() + "." + folder_name());
    if (filename == null) {
        String tempFname = sanitizedFName + "." + idx;
        dataErrors.add("attachment filename is null for " + sanitizedFName + " Message#" + idx + " assigning it the name: " + tempFname);
        // assign a special label to this message to denote that there was some problem in parsing.
        Set<String> lab = new LinkedHashSet<>();
        lab.add(LabelManager.LABELID_ATTCH_ERRS);
        getArchive().getLabelManager().setLabels(ed.getUniqueId(), lab);
        if (p.isMimeType("text/html")) {
            try {
                log.info("Turning message " + sanitizedFName + " Message#" + idx + " into text although it is an attachment");
                String html = (String) p.getContent();
                String text = Util.unescapeHTML(html);
                org.jsoup.nodes.Document doc = Jsoup.parse(text);
                StringBuilder sb = new StringBuilder();
                HTMLUtils.extractTextFromHTML(doc.body(), sb);
                textList.add(sb.toString());
                ed.setNoPlainText();
                // Return the html bit so it can be saved as part of the email.
                return html;
            } catch (Exception e) {
                Util.print_exception("Error reading contents of text/html multipart without a filename!", e, log);
                return null;
            }
        }
        filename = tempFname;
    }
    // Replacing any of the disallowed filename characters (\/:*?"<>|&) to _
    // (note: & causes problems with URLs for serveAttachment etc, so it's also replaced)
    String newFilename = Util.sanitizeFileName(filename);
    // Updating filename if it's changed after sanitizing.
    if (!newFilename.equals(filename)) {
        log.info("Filename changed from " + filename + " to " + newFilename);
        filename = newFilename;
    }
    try {
        ct = p.getContentType();
        // no ext in filename... let's fix it if possible.
        // The content-type prefixes below are mutually exclusive, so at most one extension is appended.
        if (!filename.contains(".")) {
            if (ct.startsWith("application/pdf")) {
                // Most common APPLICATION TYPES
                filename = filename + ".pdf";
            } else if (ct.startsWith("application/zip")) {
                filename = filename + ".zip";
            } else if (ct.startsWith("image/jpeg")) {
                // Most common IMAGE TYPES
                filename = filename + ".jpg";
            } else if (ct.startsWith("image/gif")) {
                filename = filename + ".gif";
            } else if (ct.startsWith("image/png")) {
                filename = filename + ".png";
            } else if (ct.startsWith("video/x-ms-wmv")) {
                // Most common VIDEO TYPE
                filename = filename + ".wmv";
            } else if (ct.startsWith("audio/mpeg")) {
                // Most common AUDIO TYPES
                filename = filename + ".mp3";
            } else if (ct.startsWith("audio/mp4")) {
                filename = filename + ".mp4";
            } else if (ct.startsWith("text/html")) {
                // Most common TEXT TYPE
                filename = filename + ".html";
            } else if (ct.startsWith("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
                // Word
                filename = filename + ".docx";
            } else if (ct.startsWith("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
                // Excel
                filename = filename + ".xlsx";
            } else if (ct.startsWith("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
                // PowerPoint
                filename = filename + ".pptx";
            }
        }
        // retain only up to first semi-colon; often ct is something like text/plain; name="filename"' we don't want to log the filename
        int x = ct.indexOf(";");
        if (x >= 0)
            ct = ct.substring(0, x);
        log.info("Attachment content type: " + ct + " filename = " + Util.blurKeepingExtension(filename));
    } catch (Exception pex) {
        // getSentDate() may be null (no Date header); rely on string concatenation instead of toString()
        // so that reporting the error cannot itself throw a NullPointerException.
        dataErrors.add("Can't read CONTENT-TYPE: " + ct + " filename:" + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate() + "\n Exception: " + pex + "\n" + Util.stackTrace(pex));
        return null;
    }
    boolean success = true;
    // the size passed in here is the part size, which is not really the binary blob size.
    // when we read the stream below in blobStore.add(), we'll set it again to the binary blob size
    Blob b = new EmailAttachmentBlob(filename, p.getSize(), (MimeMessage) m, p);
    if (fetchConfig.downloadAttachments) {
        if (archive.getBlobStore().contains(b)) {
            log.debug("Cache hit! " + b);
        } else {
            try {
                if (filename.endsWith(".tif"))
                    log.info("Fetching attachment..." + Util.blurKeepingExtension(filename));
                // performance critical! use large buffer! currently 256KB
                // stream will be closed by callee
                long start = System.currentTimeMillis();
                long nBytes = archive.getBlobStore().add(b, new BufferedInputStream(p.getInputStream(), 256 * 1024));
                long end = System.currentTimeMillis();
                if (nBytes != -1) {
                    long diff = end - start;
                    String s = "attachment size " + nBytes + " bytes, fetched in " + diff + " millis";
                    // bytes per millisecond is (decimal) kilobytes per second
                    if (diff > 0)
                        s += " (" + (nBytes / diff) + " KB/s)";
                    log.info(s);
                }
                Util.ASSERT(archive.getBlobStore().contains(b));
            } catch (IOException ioe) {
                success = false;
                // see note above: getSentDate() may be null
                dataErrors.add("WARNING: Unable to fetch attachment: filename: " + filename + " size = " + p.getSize() + " subject: " + m.getSubject() + " Date : " + m.getSentDate() + "\nException: " + ioe);
                ioe.printStackTrace(System.out);
            }
        }
        if (success) {
            // thumbnail generation used to happen here; it is currently disabled
            attachmentsList.add(b);
        }
    }
    return null;
}
Also used : Blob(edu.stanford.muse.datacache.Blob) JSONException(org.json.JSONException) DecoderException(org.apache.commons.codec.DecoderException)

Example 7 with Blob

use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.

the class Indexer method indexAttachments.

/**
 * Adds one document per attachment of {@code e} to the attachments (blob) index.
 * Blobs already present in {@code processedBlobSet} are skipped, since a blob may be shared
 * by several messages; newly seen blobs are added to that set when it is non-null.
 *
 * @return {@code true} if every attachment's content could be read, {@code false} if at least
 *         one blob had to be skipped because its content was unavailable
 */
private synchronized boolean indexAttachments(EmailDocument e, BlobStore blobStore, Set<Blob> processedBlobSet, IndexStats stats) throws IOException {
    // no attachments at all: trivially successful
    if (e.attachments == null) {
        return true;
    }

    final boolean trackProcessed = processedBlobSet != null;
    final boolean collectStats = stats != null;
    boolean allIndexed = true;

    for (Blob blob : e.attachments) {
        // a blob may be shared by multiple docs; index it only once
        if (trackProcessed && processedBlobSet.contains(blob)) {
            continue;
        }

        // note: id for the attachments index is the number handed out by the blob store,
        // not the full unique id (<folder>-<num>) that the emails index has.
        String attachmentDocId = Integer.toString(blobStore.index(blob));
        if (trackProcessed) {
            processedBlobSet.add(blob);
        }
        attachmentDocIdToBlob.put(attachmentDocId, blob);

        Pair<String, String> metaAndBody = blobStore.getContent(blob);
        if (metaAndBody == null) {
            // could not process this blob: remember the failure, but keep going with the others
            allIndexed = false;
            log.warn("Failed to fetch content from: " + blobStore.get_URL_Normalized(blob) + " content type: " + blob.contentType + " size: " + blob.getSize());
            continue;
        }

        // this is a Lucene document, not to be confused with edu.stanford.muse.index.Document
        org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();

        // docId, emailDocId and languages use the field type ft (rather than a stored-only field)
        // so that they remain searchable; phrase queries are never run on them.
        luceneDoc.add(new Field("docId", attachmentDocId, ft));
        luceneDoc.add(new Field("emailDocId", e.getUniqueId(), ft));

        String fullText = metaAndBody.first + "\n" + metaAndBody.second;

        // record every language detected in the text as a field in the index
        Set<String> detectedLanguages = Languages.getAllLanguages(fullText);
        luceneDoc.add(new Field("languages", Util.join(detectedLanguages, LANGUAGE_FIELD_DELIMITER), ft));

        if (edu.stanford.muse.Config.OPENNLP_NER) {
            Set<String> entityNames = setNameFieldsOpenNLP(fullText, luceneDoc);
            // the delimiter is just a connector for storing the names in one field
            luceneDoc.add(new Field("names", Util.join(entityNames, NAMES_FIELD_DELIMITER), full_ft));
            if (collectStats) {
                stats.nIndexedNames_blob += entityNames.size();
            }
        }

        // the metadata does not contain the file name, so that goes in a field of its own
        luceneDoc.add(new Field("meta", metaAndBody.first, full_ft));
        luceneDoc.add(new Field("fileName", blobStore.get_URL_Normalized(blob), full_ft));

        // Only text-like content gets a body field. Indexing the body of other types with a
        // different (non-positional) field type made phrase queries on "body" fail, so it is omitted.
        if (metaAndBody.first.contains("text")) {
            luceneDoc.add(new Field("body", metaAndBody.second, full_ft));
        }

        iwriter_blob.addDocument(luceneDoc);

        if (collectStats) {
            stats.indexedTextLength_blob += fullText.length();
        }
    }
    return allIndexed;
}
Also used : Blob(edu.stanford.muse.datacache.Blob) Field(org.apache.lucene.document.Field)

Example 8 with Blob

use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.

the class SearchResult method selectBlobs.

/**
 * Narrows the given search result to documents that have at least one attachment satisfying the
 * "attachmentFilesize", "attachmentExtension" and "attachmentType" query parameters (after the
 * documents have been filtered by date), and records the matching blobs in each document's
 * attachment highlighting info.
 *
 * this is used only by attachments page right now, not advanced search.
 * TODO: make adv. search page also use it
 *
 * @param inputSet the search result to filter; its query params supply the filter criteria
 * @return a new SearchResult containing only the documents with matching attachments
 */
public static SearchResult selectBlobs(SearchResult inputSet) {
    Archive archive = inputSet.archive;
    String neededFilesize = JSPHelper.getParam(inputSet.queryParams, "attachmentFilesize");
    // should have lower-case strings, no "." included
    Set<String> extensionsToMatch = new LinkedHashSet<>();
    for (String s : JSPHelper.getParams(inputSet.queryParams, "attachmentExtension")) {
        extensionsToMatch.add(s.trim().toLowerCase());
    }
    // or given extensions with extensions due to attachment type
    // this will have more semicolon separated extensions
    for (String t : JSPHelper.getParams(inputSet.queryParams, "attachmentType")) {
        // Front end should uniformly pass attachment types as extensions like mp3;mov;ogg etc. Earlier it was passing video, audio, doc etc.
        // In order to accommodate both cases we first check if there is a mapping from the type to actual extensions using .get(t);
        // if no such mapping is present then we assume that the input is already of the form mp3;mov;ogg and work on that.
        String exts = Config.attachmentTypeToExtensions.get(t);
        if (exts == null) {
            exts = t;
        }
        Collections.addAll(extensionsToMatch, exts.split(";"));
    }
    // whether the requested extensions include the catch-all "others" option
    boolean isOtherSelected = extensionsToMatch.contains("others");
    // get the options that were displayed for attachment types. This will be used to select attachment extensions if the option 'other'
    // was selected by the user in the drop down box of export.jsp.
    List<String> attachmentTypeOptions = Config.attachmentTypeToExtensions.values().stream().map(x -> Util.tokenize(x, ";")).flatMap(Collection::stream).collect(Collectors.toList());
    SearchResult outputSet = filterDocsByDate(inputSet);
    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> outputDocs = new HashMap<>();
    for (Document k : outputSet.matchedDocs.keySet()) {
        EmailDocument ed = (EmailDocument) k;
        if (ed.attachments == null) {
            // a message without attachments cannot contribute any blob
            continue;
        }
        Set<Blob> matchedBlobs = new HashSet<>();
        for (Blob b : ed.attachments) {
            if (!Util.filesizeCheck(neededFilesize, b.getSize()))
                continue;
            if (!(Util.nullOrEmpty(extensionsToMatch))) {
                Pair<String, String> pair = Util.splitIntoFileBaseAndExtension(archive.getBlobStore().get_URL_Normalized(b));
                String ext = pair.getSecond();
                if (ext == null)
                    continue;
                ext = ext.toLowerCase();
                // Proceed to add this attachment only if either
                // 1. other is selected and this extension is not present in the list attachmentTypeOptions, or
                // 2. this extension is present in extensionsToMatch [Q. What if there is a file with extension .others?]
                boolean firstcondition = isOtherSelected && !attachmentTypeOptions.contains(ext);
                boolean secondcondition = extensionsToMatch.contains(ext);
                if (!firstcondition && !secondcondition)
                    continue;
            }
            // ok, we've survived all filters, add b
            matchedBlobs.add(b);
        }
        // keep this document only if at least one of its attachments matched
        if (!matchedBlobs.isEmpty()) {
            Pair<BodyHLInfo, AttachmentHLInfo> hlInfo = inputSet.matchedDocs.get(k);
            BodyHLInfo bhlinfo = hlInfo.first;
            AttachmentHLInfo attachmentHLInfo = hlInfo.second;
            attachmentHLInfo.addMultipleInfo(matchedBlobs);
            outputDocs.put(k, new Pair<>(bhlinfo, attachmentHLInfo));
        }
    }
    return new SearchResult(outputDocs, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
Also used : Blob(edu.stanford.muse.datacache.Blob) Pair(edu.stanford.muse.util.Pair)

Example 9 with Blob

use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.

the class SearchResult method filterForAttachments.

/**
 * Looks in the given search result for messages with an attachment that satisfies all the
 * attachment-related query parameters (file name or file-name regex, numbered file name,
 * type/extension and file size). Each surviving message keeps its highlighting info, with the
 * matching blobs added to its attachment highlighting info.
 *
 * @param inputSet the search result to filter; its query params supply the filter criteria
 * @return {@code inputSet} itself if none of the attachment parameters is set, otherwise a new
 *         SearchResult restricted to messages that have at least one matching attachment
 */
private static SearchResult filterForAttachments(SearchResult inputSet) {
    Archive archive = inputSet.archive;
    String neededFilesize = JSPHelper.getParam(inputSet.queryParams, "attachmentFilesize");
    String neededFilename = JSPHelper.getParam(inputSet.queryParams, "attachmentFilename");
    // this can come in as a single parameter with multiple values (in case of multiple selections by the user)
    Collection<String> neededTypeStr = JSPHelper.getParams(inputSet.queryParams, "attachmentType");
    String neededExtensionStr = JSPHelper.getParam(inputSet.queryParams, "attachmentExtension");
    // adding support for searching by indexed file name. For example if the file name is image00.png then ePADD adds a unique number before it
    // to distinguish with other files of the same name. If the same attachment appears in more than one message (same as in content and filename)
    // then the same numbered file is linked to both messages. This support of searching by numbered filename is to support the case when user wants
    // to search for a specific image00.png (identified by the number prefix in its name).
    String numberedFileNames = JSPHelper.getParam(inputSet.queryParams, "attachmentFileWithNumber");
    if (Util.nullOrEmpty(numberedFileNames) && Util.nullOrEmpty(neededFilesize) && Util.nullOrEmpty(neededFilename) && Util.nullOrEmpty(neededTypeStr) && Util.nullOrEmpty(neededExtensionStr)) {
        return inputSet;
    }
    // set up the file names incl. regex pattern if applicable
    String neededFilenameRegex = JSPHelper.getParam(inputSet.queryParams, "attachmentFilenameRegex");
    Set<String> neededFilenames = null;
    Set<String> neededNumberedFilenames = null;
    Pattern filenameRegexPattern = null;
    if ("on".equals(neededFilenameRegex) && !Util.nullOrEmpty(neededFilename)) {
        filenameRegexPattern = Pattern.compile(neededFilename);
    } else if (!Util.nullOrEmpty(neededFilename)) {
        // will be in lower case
        neededFilenames = Util.splitFieldForOr(neededFilename);
    }
    // parse numberedFileNames if present
    if (!Util.nullOrEmpty(numberedFileNames)) {
        neededNumberedFilenames = Util.splitFieldForOr(numberedFileNames);
    }
    // set up the extensions
    // will be in lower case
    Set<String> neededExtensions = new LinkedHashSet<>();
    if (!Util.nullOrEmpty(neededTypeStr) || !Util.nullOrEmpty(neededExtensionStr)) {
        // compile the list of all extensions from type (audio/video, etc) and explicitly provided extensions
        if (!Util.nullOrEmpty(neededTypeStr)) {
            // will be something like "mp3;ogg,avi;mp4" multiselect picker gives us , separated between types, convert it to ;
            for (String s : neededTypeStr) {
                neededExtensions.addAll(Util.splitFieldForOr(s));
            }
        }
        if (!Util.nullOrEmpty(neededExtensionStr)) {
            neededExtensions.addAll(Util.splitFieldForOr(neededExtensionStr));
        }
    } else {
        // if attachment type and attachment extensions are not provided fill in the set neededExtensions
        // with the set of all possible extensions/types..
        for (String s : Config.attachmentTypeToExtensions.values()) {
            neededExtensions.addAll(Util.splitFieldForOr(s));
        }
    }
    // The two values below do not depend on the document or blob being examined, so they are
    // computed once here instead of once per attachment.
    // whether the requested extensions include the catch-all "others" option
    boolean isOtherSelected = neededExtensions.contains("others");
    // get the options that were displayed for attachment types. This will be used to select attachment extensions if the option 'other'
    // was selected by the user in the drop down box of export.jsp.
    List<String> attachmentTypeOptions = Config.attachmentTypeToExtensions.values().stream().map(x -> Util.tokenize(x, ";")).flatMap(Collection::stream).collect(Collectors.toList());

    Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> outputDocs = new HashMap<>();
    for (Document k : inputSet.matchedDocs.keySet()) {
        EmailDocument ed = (EmailDocument) k;
        if (ed.attachments == null) {
            // a message without attachments cannot satisfy an attachment filter
            continue;
        }
        Set<Blob> matchedBlobs = new HashSet<>();
        for (Blob b : ed.attachments) {
            // does it satisfy all 3 requirements? if we find any condition that is set and doesn't match, bail out of the loop to the next blob
            // of course its kinda pointless to specify extension if filename is already specified
            // 1. filename matches?
            String url = archive.getBlobStore().full_filename_normalized(b, false);
            if (filenameRegexPattern == null) {
                // non-regex check
                // NOTE(review): this tests the raw neededFilename parameter rather than the entries of
                // neededFilenames, so a multi-valued (OR) file name can never match - confirm intent.
                if (neededFilenames != null && (url == null || !(url.contains(neededFilename))))
                    continue;
                if (neededNumberedFilenames != null) {
                    // check if the numbered name of the file is in the set neededNumberedFilenames. if not, skip this blob.
                    // a blob without a numbered name cannot match either.
                    String numberedurl = archive.getBlobStore().full_filename_normalized(b, true);
                    if (numberedurl == null || !neededNumberedFilenames.contains(numberedurl.toLowerCase()))
                        continue;
                }
            } else {
                // regex check (the pattern is only ever compiled when neededFilename is non-empty)
                // use find rather than matches because we want partial match on the filename, doesn't have to be full match
                if (url == null || !filenameRegexPattern.matcher(url).find())
                    continue;
            }
            // 2. extension matches?
            if (url == null)
                // just over-defensive, if no name, effectively doesn't match
                continue;
            String extension = Util.getExtension(url);
            if (extension == null) {
                // It means that this file doesn't have extension. In this case match this only if the needed extension is "unidentified" type.
                extension = "Unidentified";
            }
            extension = extension.toLowerCase();
            // Proceed to add this attachment only if either
            // 1. other is selected and this extension is not present in the list attachmentTypeOptions, or
            // 2. this extension is present in the variable neededExtensions [Q. What if there is a file with extension .others?]
            boolean firstcondition = isOtherSelected && !attachmentTypeOptions.contains(extension);
            boolean secondcondition = neededExtensions.contains(extension);
            if (!firstcondition && !secondcondition)
                continue;
            // 3. size matches?
            if (!Util.filesizeCheck(neededFilesize, b.getSize()))
                continue;
            // if we reached here, all conditions must be satisfied
            matchedBlobs.add(b);
        }
        // keep this document only if at least one of its attachments matched
        if (!matchedBlobs.isEmpty()) {
            Pair<BodyHLInfo, AttachmentHLInfo> hlInfo = inputSet.matchedDocs.get(k);
            BodyHLInfo bhlinfo = hlInfo.first;
            AttachmentHLInfo attachmentHLInfo = hlInfo.second;
            attachmentHLInfo.addMultipleInfo(matchedBlobs);
            outputDocs.put(k, new Pair<>(bhlinfo, attachmentHLInfo));
        }
    }
    return new SearchResult(outputDocs, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
}
Also used : Pattern(java.util.regex.Pattern) Blob(edu.stanford.muse.datacache.Blob) Pair(edu.stanford.muse.util.Pair)

Example 10 with Blob

use of edu.stanford.muse.datacache.Blob in project epadd by ePADD.

the class SearchResult method searchForTerm.

/**
 * returns SearchResult containing docs and attachments matching the given term.
 *
 * <p>The term is looked up in exactly one of full body, original body or subject (the first of
 * the "termBody", "termOriginalBody", "termSubject" query parameters that is "on"), and, if
 * "termAttachments" is "on", also in attachments. When attachments are not searched,
 * {@code inputSet} itself is narrowed in place and returned.
 *
 * @param inputSet Input search result object on which this term filtering needs to be done
 * @param term     term to search for
 * @return searchresult obj
 */
public static SearchResult searchForTerm(SearchResult inputSet, String term) {
    // pick the single message-text scope to query. This is an else-if chain because we don't
    // want to look at both body and body original.
    Indexer.QueryType queryType = null;
    if ("on".equals(JSPHelper.getParam(inputSet.queryParams, "termBody"))) {
        queryType = Indexer.QueryType.FULL;
    } else if ("on".equals(JSPHelper.getParam(inputSet.queryParams, "termOriginalBody"))) {
        queryType = Indexer.QueryType.ORIGINAL;
    } else if ("on".equals(JSPHelper.getParam(inputSet.queryParams, "termSubject"))) {
        queryType = Indexer.QueryType.SUBJECT;
    }
    Set<Document> docsForTerm = new LinkedHashSet<>();
    if (queryType != null) {
        Indexer.QueryOptions options = new Indexer.QueryOptions();
        options.setQueryType(queryType);
        docsForTerm.addAll(inputSet.archive.docsForQuery(term, options));
    }

    SearchResult outputSet;
    if ("on".equals(JSPHelper.getParam(inputSet.queryParams, "termAttachments"))) {
        Map<Document, Pair<BodyHLInfo, AttachmentHLInfo>> attachmentSearchResult = new HashMap<>();
        Set<Blob> blobsForTerm = inputSet.archive.blobsForQuery(term);
        // iterate over 'all attachments' of docs present in 'inputSet'
        for (Document d : inputSet.matchedDocs.keySet()) {
            EmailDocument edoc = (EmailDocument) d;
            // attachments of this message that contain the term; a message with no attachment
            // list simply has none
            Set<Blob> commonAttachments = new HashSet<>();
            if (edoc.attachments != null) {
                commonAttachments.addAll(edoc.attachments);
            }
            commonAttachments.retainAll(blobsForTerm);

            boolean bodyMatched = docsForTerm.contains(d);
            boolean attachmentMatched = !commonAttachments.isEmpty();
            if (!bodyMatched && !attachmentMatched) {
                // the term occurs neither in the message text nor in any attachment: drop the doc
                continue;
            }
            Pair<BodyHLInfo, AttachmentHLInfo> hlInfo = inputSet.matchedDocs.get(d);
            BodyHLInfo bhlinfo = hlInfo.first;
            AttachmentHLInfo attachmentHLInfo = hlInfo.second;
            if (bodyMatched) {
                // term found in the message text: record it for body highlighting
                bhlinfo.addTerm(term);
            }
            if (attachmentMatched) {
                // term found in attachments: record which ones for attachment highlighting
                attachmentHLInfo.addMultipleInfo(commonAttachments);
            }
            attachmentSearchResult.put(d, new Pair<>(bhlinfo, attachmentHLInfo));
        }
        outputSet = new SearchResult(attachmentSearchResult, inputSet.archive, inputSet.queryParams, inputSet.commonHLInfo, inputSet.regexToHighlight);
    } else {
        // just retain only those document in inputSet.matchedDocs which are present in docsForTerm set.
        inputSet.matchedDocs.keySet().retainAll(docsForTerm);
        outputSet = inputSet;
    }
    // Add term to common highlighting info (as it is without parsing) for highlighting.
    // The term will be in lucene syntax (OR,AND etc.)
    // lucene highlighter will take care of highlighting that.
    outputSet.commonHLInfo.addTerm(term);
    return outputSet;
}
Also used : Blob(edu.stanford.muse.datacache.Blob) Pair(edu.stanford.muse.util.Pair)

Aggregations

Blob (edu.stanford.muse.datacache.Blob)29 Pair (edu.stanford.muse.util.Pair)7 Pattern (java.util.regex.Pattern)5 BlobStore (edu.stanford.muse.datacache.BlobStore)4 Field (org.apache.lucene.document.Field)4 JSONException (org.json.JSONException)3 LinkedHashMultimap (com.google.common.collect.LinkedHashMultimap)2 Multimap (com.google.common.collect.Multimap)2 Gson (com.google.gson.Gson)2 AddressBook (edu.stanford.muse.AddressBookManager.AddressBook)2 Contact (edu.stanford.muse.AddressBookManager.Contact)2 CorrespondentAuthorityMapper (edu.stanford.muse.AddressBookManager.CorrespondentAuthorityMapper)2 AnnotationManager (edu.stanford.muse.AnnotationManager.AnnotationManager)2 Config (edu.stanford.muse.Config)2 Label (edu.stanford.muse.LabelManager.Label)2 LabelManager (edu.stanford.muse.LabelManager.LabelManager)2 edu.stanford.muse.email (edu.stanford.muse.email)2 NameInfo (edu.stanford.muse.ie.NameInfo)2 Document (edu.stanford.muse.index.Document)2 EmailDocument (edu.stanford.muse.index.EmailDocument)2