use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.
the class ParsedMessageTest method rfc2822a5.
/**
 * Indexes a message whose address headers are full of comments and whose Date
 * header is folded, then checks the exact token sequence produced for each
 * address field of the resulting Lucene document.
 *
 * @see <a href="http://tools.ietf.org/html/rfc2822#appendix-A.5">RFC 2822, Appendix A.5</a>
 */
@Test
public void rfc2822a5() throws Exception {
    String message =
            "From: Pete(A wonderful \\) chap) <pete(his account)@(comment)silly.test(his host)>\n"
            + "To: Chris <c@(xxx bbb)public.example>,\n"
            + " joe@example.org,\n"
            + " John <jdoe@one.test> (my dear friend); (the end of the group)\n"
            + "Cc:(Empty list)(start)Undisclosed recipients :(nobody(that I know)) ;\n"
            + "Date: Thu,\n"
            + " 13\n"
            + " Feb\n"
            + " 1969\n"
            + " 23:32\n"
            + " -0330 (Newfoundland Time)\n"
            + "Message-ID: <testabcd.1234@silly.test>\n"
            + "\n"
            + "Testing.";

    ParsedMessage parsed = new ParsedMessage(message.getBytes(), false);
    List<IndexDocument> indexDocs = parsed.getLuceneDocuments();
    // The message has a single text body, so exactly one document is expected.
    Assert.assertEquals(1, indexDocs.size());
    Document luceneDoc = indexDocs.get(0).toDocument();

    // From: comment words, then the address and its local-part/domain pieces.
    List<String> expectedFrom = Arrays.asList(
            "pete", "a", "wonderful", "chap", "pete", "his", "account", "comment",
            "silly.test", "his", "host", "pete@silly.test", "pete", "@silly.test", "silly.test");
    RFC822AddressTokenStream fromTokens =
            (RFC822AddressTokenStream) luceneDoc.getFieldable(LuceneFields.L_H_FROM).tokenStreamValue();
    Assert.assertEquals(expectedFrom, fromTokens.getAllTokens());

    // To: three recipients.
    List<String> expectedTo = Arrays.asList(
            "chris", "c@", "c", "xxx", "bbb", "public.example",
            "joe@example.org", "joe", "@example.org", "example.org", "example", "@example",
            "john", "jdoe@one.test", "jdoe", "@one.test", "one.test",
            "my", "dear", "friend", "the", "end", "of", "the", "group",
            "c@public.example", "c", "@public.example", "public.example");
    RFC822AddressTokenStream toTokens =
            (RFC822AddressTokenStream) luceneDoc.getFieldable(LuceneFields.L_H_TO).tokenStreamValue();
    Assert.assertEquals(expectedTo, toTokens.getAllTokens());

    // Cc: an empty group, so only the words of the comments and group name remain.
    List<String> expectedCc = Arrays.asList(
            "empty", "list", "start", "undisclosed", "recipients", "nobody", "that", "i", "know");
    RFC822AddressTokenStream ccTokens =
            (RFC822AddressTokenStream) luceneDoc.getFieldable(LuceneFields.L_H_CC).tokenStreamValue();
    Assert.assertEquals(expectedCc, ccTokens.getAllTokens());

    // The message carries no envelope headers, so both envelope fields must be empty.
    RFC822AddressTokenStream envFromTokens =
            (RFC822AddressTokenStream) luceneDoc.getFieldable(LuceneFields.L_H_X_ENV_FROM).tokenStreamValue();
    Assert.assertEquals(0, envFromTokens.getAllTokens().size());
    RFC822AddressTokenStream envToTokens =
            (RFC822AddressTokenStream) luceneDoc.getFieldable(LuceneFields.L_H_X_ENV_TO).tokenStreamValue();
    Assert.assertEquals(0, envToTokens.getAllTokens().size());
}
use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.
the class ParsedContact method getPrimaryDocument.
/**
 * Builds the primary index document for this contact.
 *
 * <p>The document receives: the contact's email addresses as the "To" field,
 * the file-as string as the "From" field, a dedicated contact-data field made
 * of the email tokens plus a fixed set of name/company fields, a content field
 * (email tokens first, then the other field values, then {@code contentStrIn}),
 * and every indexed field as a key:value pair in the structured field.
 *
 * @param acct account used to look up which contact fields are email fields
 * @param contentStrIn extra text appended at the end of the content field
 * @return the populated index document
 * @throws ServiceException propagated from the calls made while building the document
 */
private IndexDocument getPrimaryDocument(Account acct, String contentStrIn) throws ServiceException {
    String[] emailFields = Contact.getEmailFields(acct);
    StringBuilder nonEmailText = new StringBuilder();
    FieldTokenStream structuredFields = new FieldTokenStream();

    for (Map.Entry<String, String> field : getFields().entrySet()) {
        String name = field.getKey();
        String value = field.getValue();

        // Ignore these fields as they can either be too big or contain encoded data.
        boolean notIndexable = Contact.isSMIMECertField(name)
                || ContactConstants.A_member.equals(name)
                || ContactConstants.A_groupMember.equals(name);
        if (notIndexable) {
            continue;
        }

        // Email addresses are left out here because their tokens are prepended to the
        // content further down; the file-as value is left out of the content as well.
        if (!Contact.isEmailField(emailFields, name) && !ContactConstants.A_fileAs.equalsIgnoreCase(name)) {
            nonEmailText.append(value).append(' ');
        }
        structuredFields.add(name, value);
    }

    // Concatenate all of this contact's email addresses (each followed by a comma).
    // Group members are not dereferenced: indexing them is only confusing when searching.
    StringBuilder addressList = new StringBuilder();
    for (String address : Contact.getEmailAddresses(emailFields, getFields(), DerefGroupMembersOption.NONE)) {
        addressList.append(address).append(',');
    }
    RFC822AddressTokenStream to = new RFC822AddressTokenStream(addressList.toString());
    String emailStrTokens = StringUtil.join(" ", to.getAllTokens());

    // Searchable contact data: email tokens followed by the name/company fields.
    StringBuilder searchText = new StringBuilder(emailStrTokens);
    searchText.append(' ');
    String[] searchableFields = {
        ContactConstants.A_company,
        ContactConstants.A_phoneticCompany,
        ContactConstants.A_firstName,
        ContactConstants.A_phoneticFirstName,
        ContactConstants.A_lastName,
        ContactConstants.A_phoneticLastName,
        ContactConstants.A_nickname,
        ContactConstants.A_fullName,
    };
    for (String searchable : searchableFields) {
        appendContactField(searchText, this, searchable);
    }

    // The email tokens go FIRST in the content so that they get a higher search
    // score than the remaining text.
    StringBuilder fullContent = new StringBuilder(emailStrTokens);
    fullContent.append(' ').append(nonEmailText).append(' ').append(contentStrIn);

    IndexDocument doc = new IndexDocument();
    /* put the email addresses in the "To" field so they can be more easily searched */
    doc.addTo(to);
    /* put the name in the "From" field since the MailItem table uses 'Sender' */
    doc.addFrom(new RFC822AddressTokenStream(Contact.getFileAsString(contactFields)));
    /* bug 11831 - put contact searchable data in its own field so wildcard search works better */
    doc.addContactData(searchText.toString());
    doc.addContent(fullContent.toString());
    doc.addPartName(LuceneFields.L_PARTNAME_CONTACT);
    // add key:value pairs to the structured FIELD Lucene field
    doc.addField(structuredFields);
    return doc;
}
use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.
the class ParsedDocument method performExtraction.
/**
 * Extracts the document's text and builds its index document.
 *
 * <p>A failure while fetching the text content is tolerated: the content is
 * treated as empty and the rest of the document is still built. Whatever
 * happens, {@code parsed} is set to {@code true} when this method exits.
 * A temporary conversion failure additionally sets
 * {@code temporaryAnalysisFailure}.
 */
private synchronized void performExtraction() {
    try {
        final long startedAt = System.currentTimeMillis();

        MimeHandler mimeHandler = MimeHandlerManager.getMimeHandler(contentType, filename);
        assert (mimeHandler != null);
        // The blob is only handed to the handler when indexing is enabled for it.
        if (mimeHandler.isIndexingEnabled()) {
            mimeHandler.init(new BlobDataSource(blob, contentType));
        }
        mimeHandler.setFilename(filename);
        mimeHandler.setPartName(LuceneFields.L_PARTNAME_TOP);
        mimeHandler.setSize(size);

        String extractedText;
        try {
            extractedText = mimeHandler.getContent();
        } catch (MimeHandlerException contentFailure) {
            // Carry on with empty text so the remaining fields still get indexed.
            extractedText = "";
            if (!ConversionException.isTemporaryCauseOf(contentFailure)) {
                ZimbraLog.index.warn("Failure indexing wiki document " + filename + ". Item will be partially indexed", contentFailure);
            } else {
                ZimbraLog.doc.warn("Temporary failure extracting from the document. (is convertd down?)", contentFailure);
                temporaryAnalysisFailure = true;
            }
        }

        fragment = Fragment.getFragment(extractedText, Fragment.Source.NOTEBOOK);
        document = new IndexDocument(mimeHandler.getDocument());
        document.addSubject(filename);
        // If the version was changed before extraction, add it in now
        if (version > 0) {
            document.addVersion(version);
        }

        // Content = filename, tokenized filename, extracted text, description.
        StringBuilder indexedContent = new StringBuilder();
        String filenameTokens = ZimbraAnalyzer.getAllTokensConcatenated(LuceneFields.L_FILENAME, filename);
        appendToContent(indexedContent, filename);
        appendToContent(indexedContent, filenameTokens);
        appendToContent(indexedContent, extractedText);
        appendToContent(indexedContent, description);
        document.addContent(indexedContent.toString());

        document.addFrom(new RFC822AddressTokenStream(creator));
        document.addFilename(filename);

        long elapsed = System.currentTimeMillis() - startedAt;
        ZimbraLog.doc.debug("ParsedDocument performExtraction elapsed=" + elapsed);
    } catch (MimeHandlerException mhe) {
        if (!ConversionException.isTemporaryCauseOf(mhe)) {
            ZimbraLog.doc.error("cannot create ParsedDocument", mhe);
        } else {
            ZimbraLog.doc.warn("Temporary failure extracting from the document. (is convertd down?)", mhe);
            temporaryAnalysisFailure = true;
        }
    } catch (Exception e) {
        ZimbraLog.index.warn("Failure indexing wiki document " + filename + ". Item will be partially indexed", e);
    } finally {
        // Mark as parsed even on failure.
        parsed = true;
    }
}
use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.
the class CalendarItem method getIndexDocuments.
/**
 * Builds the index documents for this calendar item.
 *
 * <p>For every invite, the documents produced from the invite's MIME message
 * (or a single bare document when no message is available) are collected, and
 * their content, To, From and Subject fields are overridden with data from the
 * invite: name, organizer, attendees, location, description, comments,
 * contacts and categories. Values missing on an invite fall back to those of
 * the default invite where the code provides a fallback (name, organizer,
 * location). Finally every document is tagged "public" or "private".
 *
 * @return the index documents; empty when the item has no invites
 * @throws TemporaryIndexingException if analysis of an invite's message
 *         reports a temporary failure
 */
protected List<IndexDocument> getIndexDocuments() throws TemporaryIndexingException {
    List<IndexDocument> toRet = new ArrayList<IndexDocument>();
    // Bail out before asking for the default invite: this method can be called
    // during commit of a cancel operation, when there are no invites left.
    if (numInvites() < 1) {
        return toRet;
    }

    Invite defaultInvite = getDefaultInviteOrNull();
    String defaultLocation = "";
    String defaultName = "";
    String defaultOrganizer = "";
    if (defaultInvite != null) {
        if (defaultInvite.getLocation() != null) {
            defaultLocation = defaultInvite.getLocation();
        }
        if (defaultInvite.getName() != null) {
            defaultName = defaultInvite.getName();
        }
        if (defaultInvite.getOrganizer() != null) {
            defaultOrganizer = defaultInvite.getOrganizer().getIndexString();
        }
    }

    for (Invite inv : getInvites()) {
        StringBuilder s = new StringBuilder();
        List<String> toAddrs = new ArrayList<String>();

        // NAME (subject)
        String nameToUse = inv.getName() != null ? inv.getName() : defaultName;
        s.append(nameToUse).append(' ');

        // ORGANIZER (from)
        String orgToUse = null;
        if (inv.getOrganizer() != null) {
            String thisInvOrg = inv.getOrganizer().getIndexString();
            if (thisInvOrg != null && thisInvOrg.length() > 0) {
                orgToUse = thisInvOrg;
            }
        }
        if (orgToUse == null) {
            orgToUse = defaultOrganizer;
        }

        // ATTENDEES (TO)
        for (ZAttendee at : inv.getAttendees()) {
            try {
                toAddrs.add(at.getFriendlyAddress().toString());
                s.append(at.getIndexString()).append(' ');
            } catch (ServiceException e) {
                // Best effort: skip this attendee, but leave a trace instead of
                // swallowing the failure silently.
                if (ZimbraLog.index.isDebugEnabled()) {
                    ZimbraLog.index.debug("Caught exception fetching attendee address while indexing CalendarItem " + this.getId() + " skipping attendee", e);
                }
            }
        }
        s.append(' ');

        // LOCATION
        String locationToUse = inv.getLocation() != null ? inv.getLocation() : defaultLocation;
        s.append(locationToUse).append(' ');

        // DESCRIPTION
        try {
            String description = inv.getDescription();
            // StringBuilder.append(null) would add the literal text "null" to the
            // indexed content, so a missing description is simply left out.
            if (description != null) {
                s.append(description).append(' ');
            }
        } catch (ServiceException ex) {
            if (ZimbraLog.index.isDebugEnabled()) {
                ZimbraLog.index.debug("Caught exception fetching description while indexing CalendarItem " + this.getId() + " skipping", ex);
            }
        }

        // COMMENTS, CONTACTS, CATEGORIES
        appendEachFollowedBySpace(s, inv.getComments());
        appendEachFollowedBySpace(s, inv.getContacts());
        appendEachFollowedBySpace(s, inv.getCategories());

        MimeMessage mm = null;
        if (!inv.getDontIndexMimeMessage()) {
            try {
                mm = inv.getMimeMessage();
            } catch (ServiceException e) {
                if (ZimbraLog.index.isDebugEnabled()) {
                    ZimbraLog.index.debug("Caught MessagingException for Invite " + inv.toString() + " while fetching MM during indexing of CalendarItem " + this.getId() + " skipping Invite", e);
                }
            }
        }

        List<IndexDocument> docList = new ArrayList<IndexDocument>();
        if (mm == null) {
            // no blob!
            IndexDocument doc = new IndexDocument();
            // need to properly emulate an indexed Invite message here -- set the TOP partname
            doc.addPartName(LuceneFields.L_PARTNAME_TOP);
            docList.add(doc);
        } else {
            try {
                ParsedMessage pm = new ParsedMessage(mm, mMailbox.attachmentsIndexingEnabled());
                pm.analyzeFully();
                if (pm.hasTemporaryAnalysisFailure()) {
                    throw new MailItem.TemporaryIndexingException();
                }
                docList = pm.getLuceneDocuments();
            } catch (ServiceException e) {
                // docList stays empty, so this invite contributes no documents.
                if (ZimbraLog.index.isDebugEnabled()) {
                    ZimbraLog.index.debug("Caught MessagingException for Invite " + inv.toString() + " while indexing CalendarItem " + this.getId() + " skipping Invite", e);
                }
            }
        }

        for (IndexDocument doc : docList) {
            // update the doc, overriding many of the fields with data from the appointment
            doc.addContent(s.toString());
            doc.removeTo();
            doc.removeFrom();
            doc.removeSubject();
            for (String to : toAddrs) {
                doc.addTo(new RFC822AddressTokenStream(to));
            }
            doc.addFrom(new RFC822AddressTokenStream(orgToUse));
            doc.addSubject(nameToUse);
            toRet.add(doc);
        }
    }

    // set the "public"/"private" flag in the index for this appointment
    FieldTokenStream fields = new FieldTokenStream(INDEX_FIELD_ITEM_CLASS, isPublic() ? "public" : "private");
    for (IndexDocument doc : toRet) {
        doc.addField(fields);
    }
    return toRet;
}

/** Appends every value followed by a single space; a null list appends nothing. */
private static void appendEachFollowedBySpace(StringBuilder sb, List<String> values) {
    if (values == null) {
        return;
    }
    for (String value : values) {
        sb.append(value).append(' ');
    }
}
use of com.zimbra.cs.index.analysis.RFC822AddressTokenStream in project zm-mailbox by Zimbra.
the class ParsedMessage method getMainBodyLuceneDocument.
/**
 * Builds the index document for the top-level part of this message.
 *
 * <p>The document gets the From/To/Cc token streams, the envelope sender and
 * recipient headers, the Message-ID with its surrounding angle brackets
 * removed, every header of the message and of its immediate body parts as
 * structured key:value data, the subject, the content and the attachment
 * content types. The content is the subject, the address tokens and the
 * attachment filenames, followed by {@code fullContent}.
 *
 * @param fullContent text placed after the subject/address/filename prefix in
 *        the content field
 * @return the populated index document
 * @throws MessagingException if reading the message's content or headers fails
 * @throws ServiceException propagated from the calls made while building the document
 */
private IndexDocument getMainBodyLuceneDocument(StringBuilder fullContent) throws MessagingException, ServiceException {
    IndexDocument doc = new IndexDocument(new Document());
    doc.addMimeType(new MimeTypeTokenStream("message/rfc822"));
    doc.addPartName(LuceneFields.L_PARTNAME_TOP);
    doc.addFrom(getFromTokenStream());
    doc.addTo(getToTokenStream());
    doc.addCc(getCcTokenStream());
    try {
        doc.addEnvFrom(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-From", ",")));
    } catch (MessagingException ignored) {
        // Best effort: the envelope sender is optional, index the message without it.
    }
    try {
        doc.addEnvTo(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-To", ",")));
    } catch (MessagingException ignored) {
        // Best effort: the envelope recipient is optional, index the message without it.
    }

    // Strip one leading '<' and one trailing '>' from the Message-ID.
    // startsWith/endsWith are used rather than charAt because stripping the
    // leading bracket from a value of just "<" leaves an empty string, on which
    // charAt(length() - 1) would throw StringIndexOutOfBoundsException.
    String msgId = Strings.nullToEmpty(Mime.getHeader(getMimeMessage(), "message-id"));
    if (msgId.startsWith("<")) {
        msgId = msgId.substring(1);
    }
    if (msgId.endsWith(">")) {
        msgId = msgId.substring(0, msgId.length() - 1);
    }
    if (msgId.length() > 0) {
        doc.addMessageId(msgId);
    }

    // iterate all the message headers, add them to the structured-field data in the index
    FieldTokenStream fields = new FieldTokenStream();
    MimeMessage mm = getMimeMessage();
    List<Part> parts = new ArrayList<Part>();
    parts.add(mm);
    try {
        // Fetch the content once and reuse it for both the type check and the cast.
        Object content = mm.getContent();
        if (content instanceof ZMimeMultipart) {
            ZMimeMultipart multipart = (ZMimeMultipart) content;
            int numParts = multipart.getCount();
            for (int i = 0; i < numParts; i++) {
                parts.add(multipart.getBodyPart(i));
            }
        }
    } catch (IOException ignored) {
        // Best effort: if the content cannot be read, index whatever parts were
        // collected so far (at least the message's own headers).
    }
    for (Part part : parts) {
        Enumeration<?> en = part.getAllHeaders();
        while (en.hasMoreElements()) {
            Header h = (Header) en.nextElement();
            String key = h.getName().trim();
            String value = h.getValue();
            if (value != null) {
                value = MimeUtility.unfold(value).trim();
            } else {
                value = "";
            }
            if (key.length() > 0) {
                if (value.length() == 0) {
                    // low-level tokenizer can't deal with blank header value, so we'll index
                    // some dummy value just so the header appears in the index.
                    // Users can query for the existence of the header with a query
                    // like #headername:*
                    fields.add(key, "_blank_");
                } else {
                    fields.add(key, value);
                }
            }
        }
    }
    // add key:value pairs to the structured FIELD lucene field
    doc.addField(fields);

    String subject = getSubject();
    doc.addSubject(subject);

    // add subject and from to main content for better searching
    StringBuilder contentPrepend = new StringBuilder(subject);
    // Bug 583: add all of the TOKENIZED versions of the email addresses to our CONTENT field...
    appendToContent(contentPrepend, StringUtil.join(" ", getFromTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getToTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getCcTokenStream().getAllTokens()));
    // bug 33461: add filenames to our CONTENT field
    for (String fn : filenames) {
        appendToContent(contentPrepend, ZimbraAnalyzer.getAllTokensConcatenated(LuceneFields.L_FILENAME, fn));
        // also add the non-tokenized form, so full-filename searches match
        appendToContent(contentPrepend, fn);
    }
    String text = contentPrepend.toString() + " " + fullContent.toString();
    doc.addContent(text);
    try {
        MimeHandler.getObjects(text, doc);
    } catch (ObjectHandlerException e) {
        ZimbraLog.index.warn("Unable to recognize searchable objects in message: msgid=%s,subject=%s", getMessageID(), getSubject(), e);
    }
    // Get the list of attachment content types from this message and any TNEF attachments
    doc.addAttachments(new MimeTypeTokenStream(Mime.getAttachmentTypeList(messageParts)));
    return doc;
}
Aggregations