Search in sources :

Example 1 with RFC822AddressTokenStream

use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.

the class ParsedMessageTest method rfc2822a5.

/**
 * Parses a raw message whose address and date headers are full of RFC 2822 comments and
 * folding white space, then checks the tokens that end up in the indexed address fields.
 * <p>
 * Asserts that exactly one Lucene document is produced, that the From/To/Cc token streams
 * contain the expected tokens in order, and that the X-Envelope-From / X-Envelope-To
 * streams are empty (the message carries neither header).
 *
 * @throws Exception if the message cannot be parsed or analyzed
 * @see <a href="http://tools.ietf.org/html/rfc2822#appendix-A.5">RFC 2822, appendix A.5</a>
 */
@Test
public void rfc2822a5() throws Exception {
    String raw = "From: Pete(A wonderful \\) chap) <pete(his account)@(comment)silly.test(his host)>\n"
            + "To: Chris <c@(xxx bbb)public.example>,\n"
            + "         joe@example.org,\n"
            + "  John <jdoe@one.test> (my dear friend); (the end of the group)\n"
            + "Cc:(Empty list)(start)Undisclosed recipients  :(nobody(that I know))  ;\n"
            + "Date: Thu,\n"
            + "      13\n"
            + "        Feb\n"
            + "          1969\n"
            + "      23:32\n"
            + "               -0330 (Newfoundland Time)\n"
            + "Message-ID:              <testabcd.1234@silly.test>\n"
            + "\n"
            + "Testing.";
    // The fixture is pure 7-bit ASCII; encode it explicitly so the test does not depend on
    // the platform default charset.
    ParsedMessage msg = new ParsedMessage(raw.getBytes("US-ASCII"), false);
    List<IndexDocument> docs = msg.getLuceneDocuments();
    Assert.assertEquals(1, docs.size());
    Document doc = docs.get(0).toDocument();

    RFC822AddressTokenStream from = (RFC822AddressTokenStream) doc.getFieldable(LuceneFields.L_H_FROM).tokenStreamValue();
    Assert.assertEquals(Arrays.asList("pete", "a", "wonderful", "chap", "pete", "his", "account", "comment", "silly.test", "his", "host", "pete@silly.test", "pete", "@silly.test", "silly.test"), from.getAllTokens());

    RFC822AddressTokenStream to = (RFC822AddressTokenStream) doc.getFieldable(LuceneFields.L_H_TO).tokenStreamValue();
    Assert.assertEquals(Arrays.asList("chris", "c@", "c", "xxx", "bbb", "public.example", "joe@example.org", "joe", "@example.org", "example.org", "example", "@example", "john", "jdoe@one.test", "jdoe", "@one.test", "one.test", "my", "dear", "friend", "the", "end", "of", "the", "group", "c@public.example", "c", "@public.example", "public.example"), to.getAllTokens());

    RFC822AddressTokenStream cc = (RFC822AddressTokenStream) doc.getFieldable(LuceneFields.L_H_CC).tokenStreamValue();
    Assert.assertEquals(Arrays.asList("empty", "list", "start", "undisclosed", "recipients", "nobody", "that", "i", "know"), cc.getAllTokens());

    // No envelope headers in the raw message, so both envelope streams must be empty.
    RFC822AddressTokenStream xEnvFrom = (RFC822AddressTokenStream) doc.getFieldable(LuceneFields.L_H_X_ENV_FROM).tokenStreamValue();
    Assert.assertEquals(0, xEnvFrom.getAllTokens().size());
    RFC822AddressTokenStream xEnvTo = (RFC822AddressTokenStream) doc.getFieldable(LuceneFields.L_H_X_ENV_TO).tokenStreamValue();
    Assert.assertEquals(0, xEnvTo.getAllTokens().size());
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) IndexDocument(com.zimbra.cs.index.IndexDocument) Document(org.apache.lucene.document.Document) RFC822AddressTokenStream(com.zimbra.cs.index.analysis.RFC822AddressTokenStream) Test(org.junit.Test)

Example 2 with RFC822AddressTokenStream

use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.

the class ParsedContact method getPrimaryDocument.

/**
 * Builds the index document for this contact.
 * <p>
 * The document receives: the tokenized email addresses as "To", the file-as string as
 * "From", a contact-data field (address tokens plus selected name/company fields), a content
 * field (address tokens first, then the non-email field values, then {@code contentStrIn}),
 * the contact part name, and every non-excluded field as a structured key/value pair.
 *
 * @param acct account used to look up which contact fields hold email addresses
 * @param contentStrIn extra text appended at the end of the content field
 * @return the populated index document
 * @throws ServiceException propagated from the contact helper calls
 */
private IndexDocument getPrimaryDocument(Account acct, String contentStrIn) throws ServiceException {
    String[] emailFields = Contact.getEmailFields(acct);
    StringBuilder nonEmailText = new StringBuilder();
    FieldTokenStream structuredFields = new FieldTokenStream();
    for (Map.Entry<String, String> field : getFields().entrySet()) {
        String name = field.getKey();
        String value = field.getValue();
        // These fields are left out entirely: they can be too big or contain encoded data.
        boolean excluded = Contact.isSMIMECertField(name)
                || ContactConstants.A_member.equals(name)
                || ContactConstants.A_groupMember.equals(name);
        if (excluded) {
            continue;
        }
        // Email addresses are put at the front of CONTENT further down, and the file-as
        // value is kept out of the free text; everything else is collected here.
        boolean isEmail = Contact.isEmailField(emailFields, name);
        if (!isEmail && !ContactConstants.A_fileAs.equalsIgnoreCase(name)) {
            nonEmailText.append(value).append(' ');
        }
        structuredFields.add(name, value);
    }

    // Concatenate all of the contact's email addresses, comma-terminated. Group members are
    // not dereferenced (DerefGroupMembersOption.NONE), so they are not indexed here.
    StringBuilder addressList = new StringBuilder();
    for (String address : Contact.getEmailAddresses(emailFields, getFields(), DerefGroupMembersOption.NONE)) {
        addressList.append(address).append(',');
    }
    RFC822AddressTokenStream addressTokens = new RFC822AddressTokenStream(addressList.toString());
    String joinedAddressTokens = StringUtil.join(" ", addressTokens.getAllTokens());

    StringBuilder searchText = new StringBuilder(joinedAddressTokens);
    searchText.append(' ');
    appendContactField(searchText, this, ContactConstants.A_company);
    appendContactField(searchText, this, ContactConstants.A_phoneticCompany);
    appendContactField(searchText, this, ContactConstants.A_firstName);
    appendContactField(searchText, this, ContactConstants.A_phoneticFirstName);
    appendContactField(searchText, this, ContactConstants.A_lastName);
    appendContactField(searchText, this, ContactConstants.A_phoneticLastName);
    appendContactField(searchText, this, ContactConstants.A_nickname);
    appendContactField(searchText, this, ContactConstants.A_fullName);

    // Address tokens go FIRST in the content so that they score higher than the other text.
    String content = joinedAddressTokens + ' ' + nonEmailText + ' ' + contentStrIn;

    IndexDocument doc = new IndexDocument();
    /* email addresses go in the "To" field so they can be more easily searched */
    doc.addTo(addressTokens);
    /* the name goes in the "From" field since the MailItem table uses 'Sender' */
    doc.addFrom(new RFC822AddressTokenStream(Contact.getFileAsString(contactFields)));
    /* bug 11831 - contact searchable data lives in its own field so wildcard search works better */
    doc.addContactData(searchText.toString());
    doc.addContent(content);
    doc.addPartName(LuceneFields.L_PARTNAME_CONTACT);
    // key:value pairs for the structured FIELD Lucene field
    doc.addField(structuredFields);
    return doc;
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) FieldTokenStream(com.zimbra.cs.index.analysis.FieldTokenStream) RFC822AddressTokenStream(com.zimbra.cs.index.analysis.RFC822AddressTokenStream) HashMap(java.util.HashMap) Map(java.util.Map)

Example 3 with RFC822AddressTokenStream

use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.

the class ParsedDocument method performExtraction.

/**
 * Extracts the document's text and builds its {@code IndexDocument}.
 * <p>
 * A MIME handler is looked up for the content type and filename; its extracted text becomes
 * the fragment and part of the indexed content, together with the filename, the tokenized
 * filename and the description. Failures are logged rather than propagated: a temporary
 * conversion failure sets {@code temporaryAnalysisFailure}, any other failure only logs.
 * {@code parsed} is set to {@code true} whether or not extraction succeeded. This method
 * does not itself check {@code parsed}; any "only once" guard is outside this block.
 */
private synchronized void performExtraction() {
    try {
        final long startedAt = System.currentTimeMillis();
        MimeHandler mimeHandler = MimeHandlerManager.getMimeHandler(contentType, filename);
        assert (mimeHandler != null);
        if (mimeHandler.isIndexingEnabled()) {
            mimeHandler.init(new BlobDataSource(blob, contentType));
        }
        mimeHandler.setFilename(filename);
        mimeHandler.setPartName(LuceneFields.L_PARTNAME_TOP);
        mimeHandler.setSize(size);

        // A failed extraction leaves the text empty; the rest of the document is still built.
        String extractedText = "";
        try {
            extractedText = mimeHandler.getContent();
        } catch (MimeHandlerException e) {
            if (!ConversionException.isTemporaryCauseOf(e)) {
                ZimbraLog.index.warn("Failure indexing wiki document " + filename + ".  Item will be partially indexed", e);
            } else {
                ZimbraLog.doc.warn("Temporary failure extracting from the document.  (is convertd down?)", e);
                temporaryAnalysisFailure = true;
            }
        }

        fragment = Fragment.getFragment(extractedText, Fragment.Source.NOTEBOOK);
        document = new IndexDocument(mimeHandler.getDocument());
        document.addSubject(filename);
        // A version set before extraction ran is carried into the index document now.
        if (version > 0) {
            document.addVersion(version);
        }

        StringBuilder indexedContent = new StringBuilder();
        appendToContent(indexedContent, filename);
        appendToContent(indexedContent, ZimbraAnalyzer.getAllTokensConcatenated(LuceneFields.L_FILENAME, filename));
        appendToContent(indexedContent, extractedText);
        appendToContent(indexedContent, description);
        document.addContent(indexedContent.toString());
        document.addFrom(new RFC822AddressTokenStream(creator));
        document.addFilename(filename);

        ZimbraLog.doc.debug("ParsedDocument performExtraction elapsed=" + (System.currentTimeMillis() - startedAt));
    } catch (MimeHandlerException mhe) {
        if (!ConversionException.isTemporaryCauseOf(mhe)) {
            ZimbraLog.doc.error("cannot create ParsedDocument", mhe);
        } else {
            ZimbraLog.doc.warn("Temporary failure extracting from the document.  (is convertd down?)", mhe);
            temporaryAnalysisFailure = true;
        }
    } catch (Exception e) {
        ZimbraLog.index.warn("Failure indexing wiki document " + filename + ".  Item will be partially indexed", e);
    } finally {
        // Mark as parsed even on failure.
        parsed = true;
    }
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) RFC822AddressTokenStream(com.zimbra.cs.index.analysis.RFC822AddressTokenStream) IOException(java.io.IOException) ServiceException(com.zimbra.common.service.ServiceException) ConversionException(com.zimbra.cs.convert.ConversionException)

Example 4 with RFC822AddressTokenStream

use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.

the class CalendarItem method getIndexDocuments.

/**
 * Builds the index documents for this calendar item, one batch per invite.
 * <p>
 * For each invite a text blob is assembled from the name, attendees, location, description,
 * comments, contacts and categories (falling back to the default invite's name, location and
 * organizer where the invite has none). If the invite has an indexable MIME message, the
 * documents come from parsing that message; otherwise a single empty TOP-part document is
 * used. In every document the To/From/Subject fields are replaced with the attendees,
 * organizer and name of the invite, and the assembled text is added as content. Finally every
 * document is tagged with a "public"/"private" item-class field.
 *
 * @return the index documents; empty when the item has no invites
 * @throws TemporaryIndexingException if analysis of an invite's MIME message reports a
 *         temporary failure
 */
protected List<IndexDocument> getIndexDocuments() throws TemporaryIndexingException {
    List<IndexDocument> toRet = new ArrayList<IndexDocument>();
    // Nothing to index without invites. Returning early also avoids calling
    // getDefaultInviteOrNull() when this method is called during commit of a cancel operation.
    if (numInvites() < 1) {
        return toRet;
    }
    Invite defaultInvite = getDefaultInviteOrNull();
    String defaultLocation = "";
    String defaultName = "";
    String defaultOrganizer = "";
    if (defaultInvite != null) {
        if (defaultInvite.getLocation() != null) {
            defaultLocation = defaultInvite.getLocation();
        }
        if (defaultInvite.getName() != null) {
            defaultName = defaultInvite.getName();
        }
        if (defaultInvite.getOrganizer() != null) {
            defaultOrganizer = defaultInvite.getOrganizer().getIndexString();
        }
    }
    for (Invite inv : getInvites()) {
        StringBuilder s = new StringBuilder();
        List<String> toAddrs = new ArrayList<String>();
        // NAME (subject)
        String invName = inv.getName();
        String nameToUse = invName != null ? invName : defaultName;
        s.append(nameToUse).append(' ');
        // ORGANIZER (from)
        String orgToUse = null;
        if (inv.getOrganizer() != null) {
            String thisInvOrg = inv.getOrganizer().getIndexString();
            if (thisInvOrg != null && thisInvOrg.length() > 0) {
                orgToUse = thisInvOrg;
            }
        }
        if (orgToUse == null) {
            orgToUse = defaultOrganizer;
        }
        // ATTENDEES (TO)
        for (ZAttendee at : inv.getAttendees()) {
            try {
                toAddrs.add(at.getFriendlyAddress().toString());
                s.append(at.getIndexString()).append(' ');
            } catch (ServiceException e) {
                // Best-effort: an attendee that cannot be rendered is skipped, but the failure
                // is logged instead of being silently dropped.
                if (ZimbraLog.index.isDebugEnabled()) {
                    ZimbraLog.index.debug("Caught exception fetching attendee while indexing CalendarItem " + this.getId() + " skipping attendee", e);
                }
            }
        }
        s.append(' ');
        // LOCATION
        if (inv.getLocation() != null) {
            s.append(inv.getLocation()).append(' ');
        } else {
            s.append(defaultLocation).append(' ');
        }
        // DESCRIPTION
        try {
            s.append(inv.getDescription()).append(' ');
        } catch (ServiceException ex) {
            if (ZimbraLog.index.isDebugEnabled()) {
                ZimbraLog.index.debug("Caught exception fetching description while indexing CalendarItem " + this.getId() + " skipping", ex);
            }
        }
        // COMMENTS
        List<String> comments = inv.getComments();
        if (comments != null) {
            for (String comm : comments) {
                s.append(comm).append(' ');
            }
        }
        // CONTACTS
        List<String> contacts = inv.getContacts();
        if (contacts != null) {
            for (String contact : contacts) {
                s.append(contact).append(' ');
            }
        }
        // CATEGORIES
        List<String> categories = inv.getCategories();
        if (categories != null) {
            for (String cat : categories) {
                s.append(cat).append(' ');
            }
        }
        MimeMessage mm = null;
        if (!inv.getDontIndexMimeMessage()) {
            try {
                mm = inv.getMimeMessage();
            } catch (ServiceException e) {
                if (ZimbraLog.index.isDebugEnabled()) {
                    ZimbraLog.index.debug("Caught MessagingException for Invite " + inv.toString() + " while fetching MM during indexing of CalendarItem " + this.getId() + " skipping Invite", e);
                }
            }
        }
        List<IndexDocument> docList = new ArrayList<IndexDocument>();
        if (mm == null) {
            // no blob!
            IndexDocument doc = new IndexDocument();
            // need to properly emulate an indexed Invite message here -- set the TOP partname
            doc.addPartName(LuceneFields.L_PARTNAME_TOP);
            docList.add(doc);
        } else {
            try {
                ParsedMessage pm = new ParsedMessage(mm, mMailbox.attachmentsIndexingEnabled());
                pm.analyzeFully();
                if (pm.hasTemporaryAnalysisFailure()) {
                    throw new MailItem.TemporaryIndexingException();
                }
                docList = pm.getLuceneDocuments();
            } catch (ServiceException e) {
                // docList stays empty, so this invite contributes no documents.
                if (ZimbraLog.index.isDebugEnabled()) {
                    ZimbraLog.index.debug("Caught MessagingException for Invite " + inv.toString() + " while indexing CalendarItem " + this.getId() + " skipping Invite", e);
                }
            }
        }
        for (IndexDocument doc : docList) {
            // update the doc, overriding many of the fields with data from the appointment
            doc.addContent(s.toString());
            doc.removeTo();
            doc.removeFrom();
            doc.removeSubject();
            for (String to : toAddrs) {
                doc.addTo(new RFC822AddressTokenStream(to));
            }
            doc.addFrom(new RFC822AddressTokenStream(orgToUse));
            doc.addSubject(nameToUse);
            toRet.add(doc);
        }
    }
    // set the "public"/"private" flag in the index for this appointment
    FieldTokenStream fields = new FieldTokenStream(INDEX_FIELD_ITEM_CLASS, isPublic() ? "public" : "private");
    for (IndexDocument doc : toRet) {
        doc.addField(fields);
    }
    return toRet;
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) ParsedMessage(com.zimbra.cs.mime.ParsedMessage) ArrayList(java.util.ArrayList) RFC822AddressTokenStream(com.zimbra.cs.index.analysis.RFC822AddressTokenStream) DbMailItem(com.zimbra.cs.db.DbMailItem) ServiceException(com.zimbra.common.service.ServiceException) MimeMessage(javax.mail.internet.MimeMessage) FixedMimeMessage(com.zimbra.cs.mime.Mime.FixedMimeMessage) ZAttendee(com.zimbra.cs.mailbox.calendar.ZAttendee) FieldTokenStream(com.zimbra.cs.index.analysis.FieldTokenStream) Invite(com.zimbra.cs.mailbox.calendar.Invite)

Example 5 with RFC822AddressTokenStream

use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.

the class ParsedMessage method getMainBodyLuceneDocument.

/**
 * Builds the index document for the top-level ("main body") part of this message.
 * <p>
 * The document gets the message/rfc822 MIME type, the TOP part name, the From/To/Cc token
 * streams, the X-Envelope-From / X-Envelope-To headers, the Message-ID with surrounding angle
 * brackets removed, every header of the message (and of its immediate multipart body parts)
 * as structured key/value fields, the subject, and a content field made of the subject, the
 * address tokens, the attachment filenames and {@code fullContent}.
 *
 * @param fullContent text appended after the generated prefix in the content field
 * @return the populated index document
 * @throws MessagingException if reading the MIME message fails outside the best-effort
 *         sections
 * @throws ServiceException propagated from the helper calls
 */
private IndexDocument getMainBodyLuceneDocument(StringBuilder fullContent) throws MessagingException, ServiceException {
    IndexDocument doc = new IndexDocument(new Document());
    doc.addMimeType(new MimeTypeTokenStream("message/rfc822"));
    doc.addPartName(LuceneFields.L_PARTNAME_TOP);
    doc.addFrom(getFromTokenStream());
    doc.addTo(getToTokenStream());
    doc.addCc(getCcTokenStream());
    try {
        doc.addEnvFrom(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-From", ",")));
    } catch (MessagingException ignore) {
        // best-effort: the envelope sender is simply not indexed if the header can't be read
    }
    try {
        doc.addEnvTo(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-To", ",")));
    } catch (MessagingException ignore) {
        // best-effort: the envelope recipient is simply not indexed if the header can't be read
    }
    // Strip one leading '<' and one trailing '>' from the Message-ID. startsWith/endsWith are
    // safe on an empty string, so a header consisting solely of "<" no longer triggers a
    // StringIndexOutOfBoundsException when the trailing bracket is checked.
    String msgId = Strings.nullToEmpty(Mime.getHeader(getMimeMessage(), "message-id"));
    if (msgId.startsWith("<")) {
        msgId = msgId.substring(1);
    }
    if (msgId.endsWith(">")) {
        msgId = msgId.substring(0, msgId.length() - 1);
    }
    if (msgId.length() > 0) {
        doc.addMessageId(msgId);
    }
    // iterate all the message headers, add them to the structured-field data in the index
    FieldTokenStream fields = new FieldTokenStream();
    MimeMessage mm = getMimeMessage();
    List<Part> parts = new ArrayList<Part>();
    parts.add(mm);
    try {
        // Fetch the content once so the type check and the cast act on the same object.
        Object content = mm.getContent();
        if (content instanceof ZMimeMultipart) {
            ZMimeMultipart multipart = (ZMimeMultipart) content;
            int numParts = multipart.getCount();
            for (int i = 0; i < numParts; i++) {
                parts.add(multipart.getBodyPart(i));
            }
        }
    } catch (IOException ignore) {
        // best-effort: if the content can't be read, only the headers collected so far are indexed
    }
    for (Part part : parts) {
        Enumeration<?> en = part.getAllHeaders();
        while (en.hasMoreElements()) {
            Header h = (Header) en.nextElement();
            String key = h.getName().trim();
            String value = h.getValue();
            if (value != null) {
                value = MimeUtility.unfold(value).trim();
            } else {
                value = "";
            }
            if (key.length() > 0) {
                if (value.length() == 0) {
                    // low-level tokenizer can't deal with blank header value, so we'll index
                    // some dummy value just so the header appears in the index.
                    // Users can query for the existence of the header with a query
                    // like #headername:*
                    fields.add(key, "_blank_");
                } else {
                    fields.add(key, value);
                }
            }
        }
    }
    // add key:value pairs to the structured FIELD lucene field
    doc.addField(fields);
    String subject = getSubject();
    doc.addSubject(subject);
    // add subject and from to main content for better searching
    StringBuilder contentPrepend = new StringBuilder(subject);
    // Bug 583: add all of the TOKENIZED versions of the email addresses to our CONTENT field...
    appendToContent(contentPrepend, StringUtil.join(" ", getFromTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getToTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getCcTokenStream().getAllTokens()));
    // bug 33461: add filenames to our CONTENT field
    for (String fn : filenames) {
        appendToContent(contentPrepend, ZimbraAnalyzer.getAllTokensConcatenated(LuceneFields.L_FILENAME, fn));
        // also add the non-tokenized form, so full-filename searches match
        appendToContent(contentPrepend, fn);
    }
    String text = contentPrepend.toString() + " " + fullContent.toString();
    doc.addContent(text);
    try {
        MimeHandler.getObjects(text, doc);
    } catch (ObjectHandlerException e) {
        ZimbraLog.index.warn("Unable to recognize searchable objects in message: msgid=%s,subject=%s", getMessageID(), getSubject(), e);
    }
    // Get the list of attachment content types from this message and any TNEF attachments
    doc.addAttachments(new MimeTypeTokenStream(Mime.getAttachmentTypeList(messageParts)));
    return doc;
}
Also used : IndexDocument(com.zimbra.cs.index.IndexDocument) MessagingException(javax.mail.MessagingException) MimeTypeTokenStream(com.zimbra.cs.index.analysis.MimeTypeTokenStream) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) IndexDocument(com.zimbra.cs.index.IndexDocument) RFC822AddressTokenStream(com.zimbra.cs.index.analysis.RFC822AddressTokenStream) Header(javax.mail.Header) ZMimeMessage(com.zimbra.common.zmime.ZMimeMessage) MimeMessage(javax.mail.internet.MimeMessage) Part(javax.mail.Part) FieldTokenStream(com.zimbra.cs.index.analysis.FieldTokenStream) ZMimeMultipart(com.zimbra.common.zmime.ZMimeMultipart) ObjectHandlerException(com.zimbra.cs.object.ObjectHandlerException)

Aggregations

IndexDocument (com.zimbra.cs.index.IndexDocument)5 RFC822AddressTokenStream (com.zimbra.cs.index.analysis.RFC822AddressTokenStream)5 FieldTokenStream (com.zimbra.cs.index.analysis.FieldTokenStream)3 ServiceException (com.zimbra.common.service.ServiceException)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 MimeMessage (javax.mail.internet.MimeMessage)2 Document (org.apache.lucene.document.Document)2 ZMimeMessage (com.zimbra.common.zmime.ZMimeMessage)1 ZMimeMultipart (com.zimbra.common.zmime.ZMimeMultipart)1 ConversionException (com.zimbra.cs.convert.ConversionException)1 DbMailItem (com.zimbra.cs.db.DbMailItem)1 MimeTypeTokenStream (com.zimbra.cs.index.analysis.MimeTypeTokenStream)1 Invite (com.zimbra.cs.mailbox.calendar.Invite)1 ZAttendee (com.zimbra.cs.mailbox.calendar.ZAttendee)1 FixedMimeMessage (com.zimbra.cs.mime.Mime.FixedMimeMessage)1 ParsedMessage (com.zimbra.cs.mime.ParsedMessage)1 ObjectHandlerException (com.zimbra.cs.object.ObjectHandlerException)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1