Use of com.zimbra.cs.index.analysis.MimeTypeTokenStream in project zm-mailbox by Zimbra.
From the class ParsedMessage, method handleParseError.
/**
 * Records a failure to parse one MIME part and still indexes a minimal document for it.
 * <p>
 * The failure is counted and logged, a temporary conversion failure is flagged, and the
 * part's filename (when non-empty) is remembered. The minimal document carries only the
 * MIME type, part name, filename and, when it can be read, the part size.
 *
 * @param mpi MIME info of the part that could not be parsed
 * @param error the failure that occurred while parsing the part
 */
private void handleParseError(MPartInfo mpi, Throwable error) {
    numParseErrors++;

    final String attachmentName = mpi.getFilename();
    LOG.warn("Unable to parse part=%s filename=%s content-type=%s message-id=%s", mpi.getPartName(), attachmentName, mpi.getContentType(), getMessageID(), error);

    // Remember the filename even though the part body itself could not be analyzed.
    if (!Strings.isNullOrEmpty(attachmentName)) {
        filenames.add(attachmentName);
    }
    if (ConversionException.isTemporaryCauseOf(error)) {
        temporaryAnalysisFailure = true;
    }

    // Index only the metadata that is available without parsing the part content.
    final IndexDocument minimalDoc = new IndexDocument(new Document());
    minimalDoc.addMimeType(new MimeTypeTokenStream(mpi.getContentType()));
    minimalDoc.addPartName(mpi.getPartName());
    minimalDoc.addFilename(attachmentName);
    try {
        final int partSize = mpi.getMimePart().getSize();
        minimalDoc.addSortSize(partSize);
    } catch (MessagingException ignored) {
        // size is optional here; the document is indexed without it
    }
    luceneDocuments.add(setLuceneHeadersFromContainer(minimalDoc));
}
Use of com.zimbra.cs.index.analysis.MimeTypeTokenStream in project zm-mailbox by Zimbra.
From the class MimeHandler, method getDocument.
/**
 * Builds the Lucene document used to index this handler's content.
 * <p>
 * The document receives the MIME type, any handler-specific fields, the extracted
 * content, the objects found in that content, the part name and, when the data source
 * has a name, the filename.
 *
 * @return Lucene document
 * @throws MimeHandlerException if a MIME parser error occurred
 * @throws ObjectHandlerException if a Zimlet error occurred
 * @throws ServiceException if other error occurred
 */
public final Document getDocument() throws MimeHandlerException, ObjectHandlerException, ServiceException {
    final IndexDocument indexDoc = new IndexDocument(new Document());
    indexDoc.addMimeType(new MimeTypeTokenStream(getContentType()));
    addFields(indexDoc.toDocument());

    final String text = getContent();
    indexDoc.addContent(text);
    getObjects(text, indexDoc);
    indexDoc.addPartName(partName);

    final String rawName = (dataSource == null) ? null : dataSource.getName();
    if (rawName != null) {
        String filename = rawName;
        try {
            filename = MimeUtility.decodeText(rawName);
        } catch (UnsupportedEncodingException ignored) {
            // fall back to the undecoded name
        }
        indexDoc.addFilename(filename);
    }
    return indexDoc.toDocument();
}
Use of com.zimbra.cs.index.analysis.MimeTypeTokenStream in project zm-mailbox by Zimbra.
From the class ParsedMessage, method getMainBodyLuceneDocument.
/**
 * Builds the index document for the top-level ("message/rfc822") part of this message.
 * <p>
 * The document carries the from/to/cc token streams, the envelope headers when they can
 * be read, the message id without its surrounding angle brackets, every header of the
 * message and of its immediate body parts as structured fields, the subject, and a
 * content field made of subject + address tokens + attachment filenames followed by
 * {@code fullContent}.
 *
 * @param fullContent text appended, after a single space, to the generated content prefix
 * @return the populated index document
 * @throws MessagingException if the message content or a part's headers cannot be read
 * @throws ServiceException if one of the called helpers fails with a service error
 */
private IndexDocument getMainBodyLuceneDocument(StringBuilder fullContent) throws MessagingException, ServiceException {
    IndexDocument doc = new IndexDocument(new Document());
    doc.addMimeType(new MimeTypeTokenStream("message/rfc822"));
    doc.addPartName(LuceneFields.L_PARTNAME_TOP);
    doc.addFrom(getFromTokenStream());
    doc.addTo(getToTokenStream());
    doc.addCc(getCcTokenStream());
    try {
        doc.addEnvFrom(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-From", ",")));
    } catch (MessagingException ignore) {
        // best effort: the envelope sender is simply left out of the index
    }
    try {
        doc.addEnvTo(new RFC822AddressTokenStream(getMimeMessage().getHeader("X-Envelope-To", ",")));
    } catch (MessagingException ignore) {
        // best effort: the envelope recipient is simply left out of the index
    }

    // Strip one leading '<' and one trailing '>' from the message id. startsWith/endsWith
    // are safe on the empty string, so a degenerate header such as "<" no longer triggers
    // a StringIndexOutOfBoundsException when the trailing bracket is checked.
    String msgId = Strings.nullToEmpty(Mime.getHeader(getMimeMessage(), "message-id"));
    if (msgId.startsWith("<")) {
        msgId = msgId.substring(1);
    }
    if (msgId.endsWith(">")) {
        msgId = msgId.substring(0, msgId.length() - 1);
    }
    if (!msgId.isEmpty()) {
        doc.addMessageId(msgId);
    }

    // iterate all the message headers, add them to the structured-field data in the index
    FieldTokenStream fields = new FieldTokenStream();
    MimeMessage mm = getMimeMessage();
    List<Part> parts = new ArrayList<Part>();
    parts.add(mm);
    try {
        // Fetch the content once; for a multipart message also index the headers of
        // each immediate body part.
        Object body = mm.getContent();
        if (body instanceof ZMimeMultipart) {
            ZMimeMultipart multipart = (ZMimeMultipart) body;
            int numParts = multipart.getCount();
            for (int i = 0; i < numParts; i++) {
                parts.add(multipart.getBodyPart(i));
            }
        }
    } catch (IOException ignore) {
        // best effort: fall back to the headers of the parts collected so far
    }
    for (Part part : parts) {
        Enumeration<?> en = part.getAllHeaders();
        while (en.hasMoreElements()) {
            Header h = (Header) en.nextElement();
            String key = h.getName().trim();
            String value = h.getValue();
            if (value != null) {
                value = MimeUtility.unfold(value).trim();
            } else {
                value = "";
            }
            if (key.length() > 0) {
                if (value.length() == 0) {
                    // low-level tokenizer can't deal with blank header value, so we'll index
                    // some dummy value just so the header appears in the index.
                    // Users can query for the existence of the header with a query
                    // like #headername:*
                    fields.add(key, "_blank_");
                } else {
                    fields.add(key, value);
                }
            }
        }
    }
    // add key:value pairs to the structured FIELD lucene field
    doc.addField(fields);

    String subject = getSubject();
    doc.addSubject(subject);
    // add subject and from to main content for better searching
    StringBuilder contentPrepend = new StringBuilder(subject);
    // Bug 583: add all of the TOKENIZED versions of the email addresses to our CONTENT field...
    appendToContent(contentPrepend, StringUtil.join(" ", getFromTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getToTokenStream().getAllTokens()));
    appendToContent(contentPrepend, StringUtil.join(" ", getCcTokenStream().getAllTokens()));
    // bug 33461: add filenames to our CONTENT field
    for (String fn : filenames) {
        appendToContent(contentPrepend, ZimbraAnalyzer.getAllTokensConcatenated(LuceneFields.L_FILENAME, fn));
        // also add the non-tokenized form, so full-filename searches match
        appendToContent(contentPrepend, fn);
    }
    String text = contentPrepend.toString() + " " + fullContent.toString();
    doc.addContent(text);
    try {
        MimeHandler.getObjects(text, doc);
    } catch (ObjectHandlerException e) {
        // object recognition is optional; the message is still indexed without it
        ZimbraLog.index.warn("Unable to recognize searchable objects in message: msgid=%s,subject=%s", getMessageID(), getSubject(), e);
    }
    // Get the list of attachment content types from this message and any TNEF attachments
    doc.addAttachments(new MimeTypeTokenStream(Mime.getAttachmentTypeList(messageParts)));
    return doc;
}
Aggregations