Search in sources :

Example 1 with MimeTypeTokenStream

use of com.zimbra.cs.index.analysis.MimeTypeTokenStream in project zm-mailbox by Zimbra.

In class ParsedMessage, method handleParseError.

/**
 * Records a parse failure for one MIME part and still indexes the few
 * fields that are available without parsing its content.
 *
 * @param mpi MIME info of the part that could not be parsed
 * @param error the failure raised while parsing the part
 */
private void handleParseError(MPartInfo mpi, Throwable error) {
    final String partName = mpi.getPartName();
    final String filename = mpi.getFilename();
    final String contentType = mpi.getContentType();

    numParseErrors++;
    LOG.warn("Unable to parse part=%s filename=%s content-type=%s message-id=%s",
            partName, filename, contentType, getMessageID(), error);

    final boolean temporary = ConversionException.isTemporaryCauseOf(error);
    if (temporary) {
        temporaryAnalysisFailure = true;
    }

    final boolean hasFilename = !Strings.isNullOrEmpty(filename);
    if (hasFilename) {
        filenames.add(filename);
    }

    // Build a minimal document: MIME type, part name, filename and (if obtainable) size.
    IndexDocument minimalDoc = new IndexDocument(new Document());
    minimalDoc.addMimeType(new MimeTypeTokenStream(contentType));
    minimalDoc.addPartName(partName);
    minimalDoc.addFilename(filename);
    try {
        int partSize = mpi.getMimePart().getSize();
        minimalDoc.addSortSize(partSize);
    } catch (MessagingException ignored) {
        // best effort: the document is indexed without a sort size
    }

    luceneDocuments.add(setLuceneHeadersFromContainer(minimalDoc));
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) MessagingException(javax.mail.MessagingException) MimeTypeTokenStream(com.zimbra.cs.index.analysis.MimeTypeTokenStream) Document(org.apache.lucene.document.Document) IndexDocument(com.zimbra.cs.index.IndexDocument)

Example 2 with MimeTypeTokenStream

use of com.zimbra.cs.index.analysis.MimeTypeTokenStream in project zm-mailbox by Zimbra.

In class MimeHandler, method getDocument.

/**
 * Builds the Lucene document used to index this handler's content.
 * The document carries the MIME type, handler-specific fields, the
 * extracted content, the part name and, when a data source with a name
 * is present, the filename.
 *
 * @return Lucene document
 * @throws MimeHandlerException if a MIME parser error occurred
 * @throws ObjectHandlerException if a Zimlet error occurred
 * @throws ServiceException if other error occurred
 */
public final Document getDocument() throws MimeHandlerException, ObjectHandlerException, ServiceException {
    IndexDocument indexDoc = new IndexDocument(new Document());
    indexDoc.addMimeType(new MimeTypeTokenStream(getContentType()));
    addFields(indexDoc.toDocument());

    String text = getContent();
    indexDoc.addContent(text);
    getObjects(text, indexDoc);
    indexDoc.addPartName(partName);

    String rawName = (dataSource == null) ? null : dataSource.getName();
    if (rawName != null) {
        String filename = rawName;
        try {
            filename = MimeUtility.decodeText(rawName);
        } catch (UnsupportedEncodingException ignored) {
            // fall back to the undecoded name
        }
        indexDoc.addFilename(filename);
    }

    return indexDoc.toDocument();
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) MimeTypeTokenStream(com.zimbra.cs.index.analysis.MimeTypeTokenStream) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IndexDocument(com.zimbra.cs.index.IndexDocument) Document(org.apache.lucene.document.Document)

Example 3 with MimeTypeTokenStream

use of com.zimbra.cs.index.analysis.MimeTypeTokenStream in project zm-mailbox by Zimbra.

In class ParsedMessage, method getMainBodyLuceneDocument.

/**
 * Builds the index document for the top-level message ("message/rfc822").
 * It adds the address fields, envelope addresses, message id, every header
 * of the message and of its immediate body parts, the subject, a combined
 * content field, recognized objects and the attachment type list.
 *
 * @param fullContent text appended to the content field after the subject,
 *        address tokens and filenames
 * @return the populated index document
 * @throws MessagingException if reading the message content or part headers fails
 * @throws ServiceException if a helper called from here fails
 */
private IndexDocument getMainBodyLuceneDocument(StringBuilder fullContent) throws MessagingException, ServiceException {
    IndexDocument doc = new IndexDocument(new Document());
    doc.addMimeType(new MimeTypeTokenStream("message/rfc822"));
    doc.addPartName(LuceneFields.L_PARTNAME_TOP);
    doc.addFrom(getFromTokenStream());
    doc.addTo(getToTokenStream());
    doc.addCc(getCcTokenStream());
    try {
        doc.addEnvFrom(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-From", ",")));
    } catch (MessagingException ignore) {
        // best effort: index the message without the envelope sender
    }
    try {
        doc.addEnvTo(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-To", ",")));
    } catch (MessagingException ignore) {
        // best effort: index the message without the envelope recipient
    }
    String msgId = Strings.nullToEmpty(Mime.getHeader(getMimeMessage(), "message-id"));
    // Strip one enclosing '<' and one enclosing '>'. startsWith/endsWith are safe on
    // an empty string, so a header value of just "<" no longer triggers a
    // StringIndexOutOfBoundsException when checking the last character.
    if (msgId.startsWith("<")) {
        msgId = msgId.substring(1);
    }
    if (msgId.endsWith(">")) {
        msgId = msgId.substring(0, msgId.length() - 1);
    }
    if (!msgId.isEmpty()) {
        doc.addMessageId(msgId);
    }
    // iterate all the message headers, add them to the structured-field data in the index
    FieldTokenStream fields = new FieldTokenStream();
    MimeMessage mm = getMimeMessage();
    List<Part> parts = new ArrayList<Part>();
    parts.add(mm);
    try {
        // Fetch the content once; only the immediate body parts of a multipart are added.
        Object content = mm.getContent();
        if (content instanceof ZMimeMultipart) {
            ZMimeMultipart multipart = (ZMimeMultipart) content;
            int numParts = multipart.getCount();
            for (int i = 0; i < numParts; i++) {
                parts.add(multipart.getBodyPart(i));
            }
        }
    } catch (IOException ignore) {
        // best effort: fall back to indexing only the headers collected so far
    }
    for (Part part : parts) {
        Enumeration<?> en = part.getAllHeaders();
        while (en.hasMoreElements()) {
            Header h = (Header) en.nextElement();
            String key = h.getName().trim();
            String value = h.getValue();
            if (value != null) {
                value = MimeUtility.unfold(value).trim();
            } else {
                value = "";
            }
            if (key.length() > 0) {
                if (value.length() == 0) {
                    // low-level tokenizer can't deal with blank header value, so we'll index
                    // some dummy value just so the header appears in the index.
                    // Users can query for the existence of the header with a query
                    // like #headername:*
                    fields.add(key, "_blank_");
                } else {
                    fields.add(key, value);
                }
            }
        }
    }
    // add key:value pairs to the structured FIELD lucene field
    doc.addField(fields);
    String subject = getSubject();
    doc.addSubject(subject);
    // add subject and from to main content for better searching
    StringBuilder contentPrepend = new StringBuilder(subject);
    // Bug 583: add all of the TOKENIZED versions of the email addresses to our CONTENT field...
    appendToContent(contentPrepend, StringUtil.join(" ", getFromTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getToTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getCcTokenStream().getAllTokens()));
    // bug 33461: add filenames to our CONTENT field
    for (String fn : filenames) {
        appendToContent(contentPrepend, ZimbraAnalyzer.getAllTokensConcatenated(LuceneFields.L_FILENAME, fn));
        // also add the non-tokenized form, so full-filename searches match
        appendToContent(contentPrepend, fn);
    }
    String text = contentPrepend.toString() + " " + fullContent.toString();
    doc.addContent(text);
    try {
        MimeHandler.getObjects(text, doc);
    } catch (ObjectHandlerException e) {
        // object recognition is optional; log and keep indexing the message
        ZimbraLog.index.warn("Unable to recognize searchable objects in message: msgid=%s,subject=%s", getMessageID(), getSubject(), e);
    }
    // Get the list of attachment content types from this message and any TNEF attachments
    doc.addAttachments(new MimeTypeTokenStream(Mime.getAttachmentTypeList(messageParts)));
    return doc;
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) MessagingException(javax.mail.MessagingException) MimeTypeTokenStream(com.zimbra.cs.index.analysis.MimeTypeTokenStream) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) IndexDocument(com.zimbra.cs.index.IndexDocument) RFC822AddressTokenStream(com.zimbra.cs.index.analysis.RFC822AddressTokenStream) Header(javax.mail.Header) ZMimeMessage(com.zimbra.common.zmime.ZMimeMessage) MimeMessage(javax.mail.internet.MimeMessage) Part(javax.mail.Part) FieldTokenStream(com.zimbra.cs.index.analysis.FieldTokenStream) ZMimeMultipart(com.zimbra.common.zmime.ZMimeMultipart) ObjectHandlerException(com.zimbra.cs.object.ObjectHandlerException)

Aggregations

IndexDocument (com.zimbra.cs.index.IndexDocument): 3, MimeTypeTokenStream (com.zimbra.cs.index.analysis.MimeTypeTokenStream): 3, Document (org.apache.lucene.document.Document): 3, MessagingException (javax.mail.MessagingException): 2, ZMimeMessage (com.zimbra.common.zmime.ZMimeMessage): 1, ZMimeMultipart (com.zimbra.common.zmime.ZMimeMultipart): 1, FieldTokenStream (com.zimbra.cs.index.analysis.FieldTokenStream): 1, RFC822AddressTokenStream (com.zimbra.cs.index.analysis.RFC822AddressTokenStream): 1, ObjectHandlerException (com.zimbra.cs.object.ObjectHandlerException): 1, IOException (java.io.IOException): 1, UnsupportedEncodingException (java.io.UnsupportedEncodingException): 1, ArrayList (java.util.ArrayList): 1, Header (javax.mail.Header): 1, Part (javax.mail.Part): 1, MimeMessage (javax.mail.internet.MimeMessage): 1