use of com.zimbra.cs.index.IndexDocument in project zm-mailbox by Zimbra.
the class Document method generateIndexData.
/**
 * Produces the index data for this document by parsing its stored blob.
 *
 * @return a list holding the single {@link IndexDocument} produced by the parser, or an
 *         empty list when the parser yields no document or when an {@link IOException} /
 *         {@link ServiceException} occurs (the failure is logged and the item is not indexed)
 * @throws TemporaryIndexingException if the blob cannot be fetched, or the parsed document
 *         reports a temporary analysis failure
 */
@Override
public List<IndexDocument> generateIndexData() throws TemporaryIndexingException {
    try {
        MailboxBlob blob = getBlob();
        if (blob == null) {
            ZimbraLog.index.warn("Unable to fetch blob for Document id=%d,ver=%d,vol=%s", mId, mVersion, getLocator());
            throw new MailItem.TemporaryIndexingException();
        }

        ParsedDocument parsed = new ParsedDocument(blob.getLocalBlob(), getName(), getContentType(), getChangeDate(), getCreator(), getDescription(), isDescriptionEnabled());
        if (parsed.hasTemporaryAnalysisFailure()) {
            throw new MailItem.TemporaryIndexingException();
        }

        IndexDocument indexDoc = parsed.getDocument();
        if (indexDoc == null) {
            // Parsing succeeded but there is nothing to index.
            return new ArrayList<IndexDocument>(0);
        }
        List<IndexDocument> result = new ArrayList<IndexDocument>(1);
        result.add(indexDoc);
        return result;
    } catch (IOException e) {
        ZimbraLog.index.warn("Error generating index data for Wiki Document " + getId() + ". Item will not be indexed", e);
    } catch (ServiceException e) {
        ZimbraLog.index.warn("Error generating index data for Wiki Document " + getId() + ". Item will not be indexed", e);
    }
    // Reached only after one of the logged failures above.
    return new ArrayList<IndexDocument>(0);
}
use of com.zimbra.cs.index.IndexDocument in project zm-mailbox by Zimbra.
the class ParsedMessage method analyzePart.
/**
* Analyzes one MIME part of the message: optionally records it as the message's iCalendar
* part, extracts its text for the toplevel document and, when attachment indexing is on,
* queues a separate index document for the part.
* <p>
* A {@code MimeHandlerException} or {@code ObjectHandlerException} raised while processing
* the part is not propagated; it is handed to {@code handleParseError} and whatever text was
* extracted up to that point is returned.
*
* @param isMainBody when true, the part's text is added to the toplevel text as long as the
*        handler does not run externally or attachment indexing is on
*        (presumably set for main body parts -- confirm against caller)
* @param mpi the MIME part to analyze
* @return Extracted toplevel text (any text that should go into the toplevel indexed document);
*         the empty string for multipart containers and when no text was extracted
* @throws MessagingException if accessing the underlying MIME part fails
* @throws ServiceException propagated from the helpers called by this method
*/
private String analyzePart(boolean isMainBody, MPartInfo mpi) throws MessagingException, ServiceException {
// Decide whether calendar data in this part should be ignored. Once a calendar part has
// been recorded, every later part is ignored for calendar purposes.
boolean ignoreCalendar;
if (calendarPartInfo == null) {
ignoreCalendar = isBouncedCalendar(mpi);
} else {
ignoreCalendar = true;
}
// A part whose Content-Type carries no "method" parameter is also ignored, unless
// LC.calendar_allow_invite_without_method is set.
String methodParam = (new ContentType(mpi.getMimePart().getContentType())).getParameter("method");
if (methodParam == null && !LC.calendar_allow_invite_without_method.booleanValue()) {
ignoreCalendar = true;
}
String toRet = "";
try {
// ignore multipart "container" parts
if (mpi.isMultipart()) {
return toRet;
}
String ctype = mpi.getContentType();
MimeHandler handler = MimeHandlerManager.getMimeHandler(ctype, mpi.getFilename());
assert (handler != null);
handler.setDefaultCharset(defaultCharset);
// NOTE(review): presumably fixes up a bad Content-Transfer-Encoding before the content is read -- confirm
Mime.repairTransferEncoding(mpi.getMimePart());
if (handler.isIndexingEnabled()) {
handler.init(mpi.getMimePart().getDataHandler().getDataSource());
handler.setPartName(mpi.getPartName());
handler.setFilename(mpi.getFilename());
handler.setSize(mpi.getSize());
// remember the first iCalendar attachment
if (!ignoreCalendar && calendarPartInfo == null) {
ZVCalendar cal = handler.getICalendar();
if (cal != null) {
setCalendarPartInfo(mpi, cal);
}
}
// Only add this part's text to the toplevel text if either:
// - this is the main body and the handler does not run externally (or indexAttachments is set), or
// - indexAttachments was set and !disableIndexingAttachmentsTogether
if ((isMainBody && (!handler.runsExternally() || indexAttachments)) || (indexAttachments && !DebugConfig.disableIndexingAttachmentsTogether)) {
toRet = handler.getContent();
}
if (indexAttachments && !DebugConfig.disableIndexingAttachmentsSeparately) {
// Each non-text MIME part is also indexed as a separate
// Lucene document. This is necessary so that we can tell the
// client what parts match if a search matched a particular
// part.
IndexDocument doc = new IndexDocument(handler.getDocument());
String filename = handler.getFilename();
if (!Strings.isNullOrEmpty(filename)) {
filenames.add(filename);
}
doc.addSortSize(mpi.getMimePart().getSize());
luceneDocuments.add(setLuceneHeadersFromContainer(doc));
}
}
// make sure we've got the text/calendar handler installed
// Fallback: no calendar has been recorded yet although this is a text/calendar part, so
// build the calendar directly from the part's input stream.
if (!ignoreCalendar && calendarPartInfo == null && ctype.equals(MimeConstants.CT_TEXT_CALENDAR)) {
// An indexing-enabled handler already had its chance to supply the calendar above.
if (handler.isIndexingEnabled()) {
ZimbraLog.index.warn("TextCalendarHandler not correctly installed");
}
InputStream is = null;
try {
// Use the default charset when the part declares none (or only whitespace).
String charset = mpi.getContentTypeParameter(MimeConstants.P_CHARSET);
if (charset == null || charset.trim().isEmpty()) {
charset = MimeConstants.P_CHARSET_DEFAULT;
}
is = mpi.getMimePart().getInputStream();
ZVCalendar cal = ZCalendarBuilder.build(is, charset);
if (cal != null) {
setCalendarPartInfo(mpi, cal);
}
} catch (IOException ioe) {
// Best effort: a calendar part that cannot be read is logged and skipped.
ZimbraLog.index.warn("error reading text/calendar mime part", ioe);
} finally {
ByteUtil.closeStream(is);
}
}
} catch (MimeHandlerException e) {
// Parse failures are recorded by handleParseError rather than propagated.
handleParseError(mpi, e);
} catch (ObjectHandlerException e) {
handleParseError(mpi, e);
}
return toRet;
}
use of com.zimbra.cs.index.IndexDocument in project zm-mailbox by Zimbra.
the class ParsedContact method getPrimaryDocument.
/**
 * Builds the primary index document for this contact.
 * <p>
 * The document receives the contact's email addresses (To), its file-as string (From), a
 * dedicated contact-data field with the email tokens plus the name/company fields, the
 * combined content text, the contact part name, and the structured key:value field stream.
 *
 * @param acct account used to look up which contact fields hold email addresses
 * @param contentStrIn additional text appended to the end of the content field
 * @return the populated index document
 * @throws ServiceException propagated from the contact helpers called here
 */
private IndexDocument getPrimaryDocument(Account acct, String contentStrIn) throws ServiceException {
    String[] emailFields = Contact.getEmailFields(acct);
    StringBuilder otherText = new StringBuilder();
    FieldTokenStream fieldTokens = new FieldTokenStream();
    for (Map.Entry<String, String> field : getFields().entrySet()) {
        String name = field.getKey();
        // These fields are left out completely: they can be too big or contain encoded data.
        boolean skipped = Contact.isSMIMECertField(name)
                || ContactConstants.A_member.equals(name)
                || ContactConstants.A_groupMember.equals(name);
        if (!skipped) {
            String value = field.getValue();
            // Email addresses are put at the front of CONTENT further down, and the fileAs
            // value is kept out of CONTENT, so neither is collected here.
            if (!Contact.isEmailField(emailFields, name) && !ContactConstants.A_fileAs.equalsIgnoreCase(name)) {
                otherText.append(value).append(' ');
            }
            fieldTokens.add(name, value);
        }
    }

    // Concatenate all of the contact's email addresses into one comma-terminated string.
    // Group members are not dereferenced because indexing them only confuses searches.
    StringBuilder emailList = new StringBuilder();
    for (String address : Contact.getEmailAddresses(emailFields, getFields(), DerefGroupMembersOption.NONE)) {
        emailList.append(address).append(',');
    }
    RFC822AddressTokenStream to = new RFC822AddressTokenStream(emailList.toString());
    String emailTokens = StringUtil.join(" ", to.getAllTokens());

    StringBuilder searchText = new StringBuilder(emailTokens);
    searchText.append(' ');
    appendContactField(searchText, this, ContactConstants.A_company);
    appendContactField(searchText, this, ContactConstants.A_phoneticCompany);
    appendContactField(searchText, this, ContactConstants.A_firstName);
    appendContactField(searchText, this, ContactConstants.A_phoneticFirstName);
    appendContactField(searchText, this, ContactConstants.A_lastName);
    appendContactField(searchText, this, ContactConstants.A_phoneticLastName);
    appendContactField(searchText, this, ContactConstants.A_nickname);
    appendContactField(searchText, this, ContactConstants.A_fullName);

    // The email tokens go FIRST in the content so that they score higher in searches than
    // the remaining text.
    StringBuilder content = new StringBuilder(emailTokens);
    content.append(' ').append(otherText).append(' ').append(contentStrIn);

    IndexDocument doc = new IndexDocument();
    // Email addresses go in the "To" field so they can be searched more easily.
    doc.addTo(to);
    // The name goes in the "From" field since the MailItem table uses 'Sender'.
    doc.addFrom(new RFC822AddressTokenStream(Contact.getFileAsString(contactFields)));
    // bug 11831 - contact searchable data lives in its own field so wildcard search works better.
    doc.addContactData(searchText.toString());
    doc.addContent(content.toString());
    doc.addPartName(LuceneFields.L_PARTNAME_CONTACT);
    // Key:value pairs go into the structured FIELD Lucene field.
    doc.addField(fieldTokens);
    return doc;
}
use of com.zimbra.cs.index.IndexDocument in project zm-mailbox by Zimbra.
the class ParsedContact method analyzeAttachment.
/**
 * Runs the matching MIME handler over one contact attachment and, when attachment indexing
 * is requested, folds its text into the contact's content and/or queues a separate index
 * document for it.
 *
 * @param attach the attachment to analyze
 * @param contentText accumulator for the toplevel content text; the attachment's text is
 *        appended to it, separated by a single space when it is not empty
 * @param indexAttachments whether attachment content should be indexed at all
 * @throws MimeHandlerException propagated from the MIME handler
 * @throws ObjectHandlerException propagated from the MIME handler
 * @throws ServiceException propagated from the MIME handler
 */
private void analyzeAttachment(Attachment attach, StringBuilder contentText, boolean indexAttachments) throws MimeHandlerException, ObjectHandlerException, ServiceException {
    MimeHandler handler = MimeHandlerManager.getMimeHandler(attach.getContentType(), attach.getFilename());
    assert (handler != null);
    if (!handler.isIndexingEnabled()) {
        return;
    }

    handler.init(attach);
    handler.setPartName(attach.getPartName());
    handler.setFilename(attach.getFilename());
    handler.setSize(attach.getSize());

    // If attachment indexing is disabled, nothing from this attachment is indexed.
    if (!indexAttachments) {
        return;
    }

    if (!DebugConfig.disableIndexingAttachmentsTogether) {
        // ALL TEXT from EVERY PART is added to the toplevel body content. This is needed
        // for multi-word queries where one word is in the body and another is in a
        // sub-attachment.
        if (contentText.length() != 0) {
            contentText.append(" ");
        }
        contentText.append(handler.getContent());
    }

    if (!DebugConfig.disableIndexingAttachmentsSeparately) {
        // Each non-text MIME part also becomes its own Lucene document, so the client can
        // be told which part matched a search.
        org.apache.lucene.document.Document luceneDoc = handler.getDocument();
        if (luceneDoc != null) {
            IndexDocument partDoc = new IndexDocument(luceneDoc);
            partDoc.addSortSize(attach.getSize());
            indexDocs.add(partDoc);
        }
    }
}
use of com.zimbra.cs.index.IndexDocument in project zm-mailbox by Zimbra.
the class ParsedMessage method handleParseError.
/**
 * Records a failure to parse a MIME part and still indexes the minimum that is known about
 * it: MIME type, part name, filename and, when available, size.
 * <p>
 * Also bumps the parse-error counter, flags a temporary analysis failure when the error has
 * a temporary cause, and remembers the part's filename if it has one.
 *
 * @param mpi MIME info of the part that failed to parse
 * @param error the error that was raised while parsing
 */
private void handleParseError(MPartInfo mpi, Throwable error) {
    numParseErrors++;

    String filename = mpi.getFilename();
    String contentType = mpi.getContentType();
    LOG.warn("Unable to parse part=%s filename=%s content-type=%s message-id=%s", mpi.getPartName(), filename, contentType, getMessageID(), error);

    if (ConversionException.isTemporaryCauseOf(error)) {
        temporaryAnalysisFailure = true;
    }
    if (!Strings.isNullOrEmpty(filename)) {
        filenames.add(filename);
    }

    // Minimal stand-in document so the part is still represented in the index.
    IndexDocument minimalDoc = new IndexDocument(new Document());
    minimalDoc.addMimeType(new MimeTypeTokenStream(contentType));
    minimalDoc.addPartName(mpi.getPartName());
    minimalDoc.addFilename(filename);
    try {
        minimalDoc.addSortSize(mpi.getMimePart().getSize());
    } catch (MessagingException ignore) {
        // Best effort: the size is simply left out when it cannot be determined.
    }
    luceneDocuments.add(setLuceneHeadersFromContainer(minimalDoc));
}
Aggregations