Search in sources :

Example 1 with RepositoryDocument

use of org.apache.manifoldcf.agents.interfaces.RepositoryDocument in project manifoldcf by apache.

Example from the class AlfrescoRepositoryConnector, method processDocuments.

/**
 * Process a set of documents.
 * This is the method that should cause each document to be fetched, processed, and the results either added
 * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
 * The document specification allows this class to filter what is done based on the job.
 * The connector will be connected before this method can be called.
 *@param documentIdentifiers is the set of document identifiers to process.
 *@param statuses are the currently-stored document versions for each document in the set of document identifiers
 * passed in above.
 *@param activities is the interface this method should use to queue up new document references
 * and ingest documents.
 *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
 *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
 */
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption {
    for (String documentIdentifier : documentIdentifiers) {
        // Prepare to access the document.  An Alfresco document identifier is a node reference.
        String nodeReference = documentIdentifier;
        String uuid = NodeUtils.getUuidFromNodeReference(nodeReference);
        if (Logging.connectors.isDebugEnabled())
            Logging.connectors.debug("Alfresco: Processing document identifier '" + nodeReference + "'");
        Reference reference = new Reference();
        reference.setStore(SearchUtils.STORE);
        reference.setUuid(uuid);
        Predicate predicate = new Predicate();
        predicate.setStore(SearchUtils.STORE);
        predicate.setNodes(new Reference[] { reference });
        Node resultNode = null;
        try {
            resultNode = NodeUtils.get(endpoint, username, password, socketTimeout, session, predicate);
        } catch (IOException e) {
            Logging.connectors.warn("Alfresco: IOException getting node: " + e.getMessage(), e);
            handleIOException(e);
        }
        // handleIOException is presumably expected to abort by throwing; guard the
        // non-throwing path so we do not NPE on resultNode below — TODO confirm.
        if (resultNode == null) {
            continue;
        }
        NamedValue[] properties = resultNode.getProperties();
        // A node with no properties cannot be a document.
        boolean isDocument;
        String versionString = "";
        if (properties != null)
            isDocument = ContentModelUtils.isDocument(properties);
        else
            isDocument = false;
        if (isDocument) {
            boolean isVersioned = NodeUtils.isVersioned(resultNode.getAspects());
            if (isVersioned) {
                versionString = NodeUtils.getVersionLabel(properties);
                // Treat a missing version label as "unversioned" rather than NPE-ing
                // on versionString.length() below.
                if (versionString == null)
                    versionString = "";
            }
        }
        // An empty version string means the document can never be considered up-to-date.
        if (versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier, versionString)) {
            // Need to (re)index
            String errorCode = "OK";
            String errorDesc = StringUtils.EMPTY;
            Long fileLengthLong = null;
            long startTime = System.currentTimeMillis();
            try {
                try {
                    boolean isFolder = ContentModelUtils.isFolder(endpoint, username, password, socketTimeout, session, reference);
                    // a generic node in Alfresco could have child-associations
                    if (isFolder) {
                        // queue all the children of the folder
                        QueryResult queryResult = SearchUtils.getChildren(endpoint, username, password, socketTimeout, session, reference);
                        ResultSet resultSet = queryResult.getResultSet();
                        ResultSetRow[] resultSetRows = resultSet.getRows();
                        for (ResultSetRow resultSetRow : resultSetRows) {
                            NamedValue[] childProperties = resultSetRow.getColumns();
                            String childNodeReference = PropertiesUtils.getNodeReference(childProperties);
                            activities.addDocumentReference(childNodeReference, nodeReference, RELATIONSHIP_CHILD);
                        }
                    }
                } catch (IOException e) {
                    errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                    errorDesc = e.getMessage();
                    Logging.connectors.warn("Alfresco: IOException finding children: " + e.getMessage(), e);
                    handleIOException(e);
                }
                // a generic node in Alfresco could also have binaries content
                if (isDocument) {
                    // this is a content to ingest
                    InputStream is = null;
                    long fileLength = 0;
                    try {
                        // properties ingestion
                        RepositoryDocument rd = new RepositoryDocument();
                        List<NamedValue> contentProperties = PropertiesUtils.getContentProperties(properties);
                        PropertiesUtils.ingestProperties(rd, properties, contentProperties);
                        // binaries ingestion - in Alfresco we could have more than one binary for each node (custom content models)
                        for (NamedValue contentProperty : contentProperties) {
                            // we are ingesting all the binaries defined as d:content property in the Alfresco content model
                            Content binary = ContentReader.read(endpoint, username, password, socketTimeout, session, predicate, contentProperty.getName());
                            fileLength = binary.getLength();
                            is = ContentReader.getBinary(endpoint, binary, username, password, socketTimeout, session);
                            rd.setBinary(is, fileLength);
                            // id is the node reference only if the node has an unique content stream
                            // For a node with a single d:content property: id = node reference
                            String id = PropertiesUtils.getNodeReference(properties);
                            // The QName of a property of type d:content will be appended to the node reference
                            if (contentProperties.size() > 1) {
                                id = id + INGESTION_SEPARATOR_FOR_MULTI_BINARY + contentProperty.getName();
                            }
                            // the document uri is related to the specific d:content property available in the node
                            // we want to ingest each content stream that are nested in a single node
                            String documentURI = binary.getUrl();
                            activities.ingestDocumentWithException(documentIdentifier, id, versionString, documentURI, rd);
                            fileLengthLong = Long.valueOf(fileLength);
                        }
                        AuthenticationUtils.endSession();
                    } catch (ParseException e) {
                        errorCode = "PARSEEXCEPTION";
                        errorDesc = e.getMessage();
                        Logging.connectors.warn("Alfresco: Error during the reading process of dates: " + e.getMessage(), e);
                        handleParseException(e);
                    } catch (IOException e) {
                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                        errorDesc = e.getMessage();
                        Logging.connectors.warn("Alfresco: IOException: " + e.getMessage(), e);
                        handleIOException(e);
                    } finally {
                        // The session was ended above (or is now invalid); force re-login on next use.
                        session = null;
                        try {
                            if (is != null) {
                                is.close();
                            }
                        } catch (InterruptedIOException e) {
                            // Interruption during close: suppress activity recording and abort.
                            errorCode = null;
                            throw new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED);
                        } catch (IOException e) {
                            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                            errorDesc = e.getMessage();
                            Logging.connectors.warn("Alfresco: IOException closing file input stream: " + e.getMessage(), e);
                            handleIOException(e);
                        }
                    }
                }
            } catch (ManifoldCFException e) {
                // Interruptions are not recorded as activities.
                if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                    errorCode = null;
                throw e;
            } finally {
                if (errorCode != null)
                    activities.recordActivity(Long.valueOf(startTime), ACTIVITY_READ, fileLengthLong, nodeReference, errorCode, errorDesc, null);
            }
        }
    }
}
Also used : InterruptedIOException(java.io.InterruptedIOException) Reference(org.alfresco.webservice.types.Reference) InputStream(java.io.InputStream) Node(org.alfresco.webservice.types.Node) SpecificationNode(org.apache.manifoldcf.core.interfaces.SpecificationNode) NamedValue(org.alfresco.webservice.types.NamedValue) ResultSetRow(org.alfresco.webservice.types.ResultSetRow) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) Predicate(org.alfresco.webservice.types.Predicate) QueryResult(org.alfresco.webservice.repository.QueryResult) Content(org.alfresco.webservice.content.Content) ManifoldCFException(org.apache.manifoldcf.core.interfaces.ManifoldCFException) ResultSet(org.alfresco.webservice.types.ResultSet) RepositoryDocument(org.apache.manifoldcf.agents.interfaces.RepositoryDocument) ParseException(java.text.ParseException)

Example 2 with RepositoryDocument

use of org.apache.manifoldcf.agents.interfaces.RepositoryDocument in project manifoldcf by apache.

Example from the class SearchBloxDocumentTest, method initRepoDocument.

/**
 * Builds a fully-populated RepositoryDocument fixture for the tests: metadata
 * fields, a mime type, an in-memory binary stream, a created date, and both
 * allow and deny ACLs at share and document security levels.
 *
 * @return the populated fixture document
 * @throws ManifoldCFException if the RepositoryDocument API rejects a value
 */
private RepositoryDocument initRepoDocument() throws ManifoldCFException {
    RepositoryDocument doc = new RepositoryDocument();
    // Metadata fields the connector under test is expected to map.
    doc.addField("title", "I am a nice title");
    doc.addField("content", "I am a nice content in english!");
    doc.addField("description", "I am a little tiny description");
    doc.addField("meta1", "I am META1!");
    doc.addField("meta2", "I am META2!");
    doc.setMimeType("html");
    // Binary payload served from memory.
    // NOTE(review): the declared length (100) exceeds the actual byte count of
    // the content — confirm whether downstream assertions depend on 100.
    String payload = "I am the binary content of an Amazing Document";
    InputStream payloadStream = new ByteArrayInputStream(payload.getBytes(StandardCharsets.UTF_8));
    doc.setBinary(payloadStream, 100);
    doc.setCreatedDate(new Date(System.currentTimeMillis()));
    // Allow ACLs.
    doc.setSecurityACL(RepositoryDocument.SECURITY_TYPE_SHARE, new String[] { "user1", "user2", "user3" });
    doc.setSecurityACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT, new String[] { "user12", "user22", "user33" });
    // Deny ACLs.
    doc.setSecurityDenyACL(RepositoryDocument.SECURITY_TYPE_SHARE, new String[] { "user4", "user5" });
    doc.setSecurityDenyACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT, new String[] { "user42", "user52" });
    // allowAttributeName + aclType
    return doc;
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) RepositoryDocument(org.apache.manifoldcf.agents.interfaces.RepositoryDocument)

Example 3 with RepositoryDocument

use of org.apache.manifoldcf.agents.interfaces.RepositoryDocument in project manifoldcf by apache.

Example from the class EmailConnector, method processDocuments.

/**
 * Process a set of documents.
 * This is the method that should cause each document to be fetched, processed, and the results either added
 * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
 * The document specification allows this class to filter what is done based on the job.
 * The connector will be connected before this method can be called.
 *@param documentIdentifiers is the set of document identifiers to process.
 *@param statuses are the currently-stored document versions for each document in the set of document identifiers
 * passed in above.
 *@param activities is the interface this method should use to queue up new document references
 * and ingest documents.
 *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
 *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
 */
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption {
    List<String> requiredMetadata = new ArrayList<String>();
    boolean useEmailExtractor = false;
    for (int i = 0; i < spec.getChildCount(); i++) {
        SpecificationNode sn = spec.getChild(i);
        if (sn.getType().equals(EmailConfig.NODE_METADATA)) {
            String metadataAttribute = sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
            requiredMetadata.add(metadataAttribute);
        }
        if (sn.getType().equals(EmailConfig.NODE_EXTRACT_EMAIL)) {
            useEmailExtractor = true;
        }
    }
    // Keep a cached set of open folders
    Map<String, Folder> openFolders = new HashMap<String, Folder>();
    try {
        for (String documentIdentifier : documentIdentifiers) {
            final Integer attachmentIndex = extractAttachmentNumberFromDocumentIdentifier(documentIdentifier);
            if (attachmentIndex == null) {
                // It's an email
                // NOT empty; we need to make ManifoldCF understand that this is a document that never will change.
                String versionString = "_" + urlTemplate;
                // Check if we need to index
                if (!activities.checkDocumentNeedsReindexing(documentIdentifier, versionString))
                    continue;
                String compositeID = documentIdentifier;
                String version = versionString;
                String folderName = extractFolderNameFromDocumentIdentifier(compositeID);
                String id = extractEmailIDFromDocumentIdentifier(compositeID);
                String errorCode = null;
                String errorDesc = null;
                Long fileLengthLong = null;
                long startTime = System.currentTimeMillis();
                try {
                    try {
                        Folder folder = openFolders.get(folderName);
                        if (folder == null) {
                            getSession();
                            OpenFolderThread oft = new OpenFolderThread(session, folderName);
                            oft.start();
                            folder = oft.finishUp();
                            openFolders.put(folderName, folder);
                        }
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("Email: Processing document identifier '" + compositeID + "'");
                        SearchTerm messageIDTerm = new MessageIDTerm(id);
                        getSession();
                        SearchMessagesThread smt = new SearchMessagesThread(session, folder, messageIDTerm);
                        smt.start();
                        Message[] message = smt.finishUp();
                        String msgURL = makeDocumentURI(urlTemplate, folderName, id);
                        Message msg = null;
                        for (Message msg2 : message) {
                            msg = msg2;
                        }
                        if (msg == null) {
                            // email was not found
                            activities.deleteDocument(documentIdentifier);
                            continue;
                        }
                        if (!activities.checkURLIndexable(msgURL)) {
                            errorCode = activities.EXCLUDED_URL;
                            errorDesc = "Excluded because of URL ('" + msgURL + "')";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        long fileLength = msg.getSize();
                        if (!activities.checkLengthIndexable(fileLength)) {
                            errorCode = activities.EXCLUDED_LENGTH;
                            errorDesc = "Excluded because of length (" + fileLength + ")";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        Date sentDate = msg.getSentDate();
                        if (!activities.checkDateIndexable(sentDate)) {
                            errorCode = activities.EXCLUDED_DATE;
                            errorDesc = "Excluded because of date (" + sentDate + ")";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        String mimeType = "text/plain";
                        if (!activities.checkMimeTypeIndexable(mimeType)) {
                            errorCode = activities.EXCLUDED_MIMETYPE;
                            errorDesc = "Excluded because of mime type ('" + mimeType + "')";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        RepositoryDocument rd = new RepositoryDocument();
                        rd.setFileName(msg.getFileName());
                        rd.setMimeType(mimeType);
                        rd.setCreatedDate(sentDate);
                        rd.setModifiedDate(sentDate);
                        for (String metadata : requiredMetadata) {
                            if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_TO)) {
                                Address[] to = msg.getRecipients(Message.RecipientType.TO);
                                if (to != null) {
                                    String[] toStr = new String[to.length];
                                    int j = 0;
                                    for (Address address : to) {
                                        toStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                                        j++;
                                    }
                                    rd.addField(EmailConfig.EMAIL_TO, toStr);
                                }
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_FROM)) {
                                Address[] from = msg.getFrom();
                                String[] fromStr = new String[from.length];
                                int j = 0;
                                for (Address address : from) {
                                    fromStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                                    j++;
                                }
                                rd.addField(EmailConfig.EMAIL_FROM, fromStr);
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_SUBJECT)) {
                                String subject = msg.getSubject();
                                rd.addField(EmailConfig.EMAIL_SUBJECT, subject);
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_DATE)) {
                                rd.addField(EmailConfig.EMAIL_DATE, sentDate.toString());
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_ATTACHMENT_ENCODING)) {
                                Object o = msg.getContent();
                                if (o != null) {
                                    if (o instanceof Multipart) {
                                        Multipart mp = (Multipart) o;
                                        String[] encoding = new String[mp.getCount()];
                                        for (int k = 0, n = mp.getCount(); k < n; k++) {
                                            Part part = mp.getBodyPart(k);
                                            if (isAttachment(part)) {
                                                final String[] fileSplit = part.getFileName().split("\\?");
                                                if (fileSplit.length > 1) {
                                                    encoding[k] = fileSplit[1];
                                                } else {
                                                    encoding[k] = "";
                                                }
                                            }
                                        }
                                        rd.addField(EmailConfig.ENCODING_FIELD, encoding);
                                    } else if (o instanceof String) {
                                        rd.addField(EmailConfig.ENCODING_FIELD, "");
                                    }
                                }
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_ATTACHMENT_MIMETYPE)) {
                                Object o = msg.getContent();
                                if (o != null) {
                                    if (o instanceof Multipart) {
                                        Multipart mp = (Multipart) o;
                                        String[] MIMEType = new String[mp.getCount()];
                                        for (int k = 0, n = mp.getCount(); k < n; k++) {
                                            Part part = mp.getBodyPart(k);
                                            if (isAttachment(part)) {
                                                MIMEType[k] = part.getContentType();
                                            }
                                        }
                                        rd.addField(EmailConfig.MIMETYPE_FIELD, MIMEType);
                                    } else if (o instanceof String) {
                                        rd.addField(EmailConfig.MIMETYPE_FIELD, "");
                                    }
                                }
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_ATTACHMENTNAME)) {
                                Object o = msg.getContent();
                                if (o != null) {
                                    if (o instanceof Multipart) {
                                        Multipart mp = (Multipart) o;
                                        String[] attachmentNames = new String[mp.getCount()];
                                        for (int k = 0, n = mp.getCount(); k < n; k++) {
                                            Part part = mp.getBodyPart(k);
                                            if (isAttachment(part)) {
                                                attachmentNames[k] = part.getFileName();
                                            }
                                        }
                                        rd.addField(EmailConfig.ATTACHMENTNAME_FIELD, attachmentNames);
                                    } else if (o instanceof String) {
                                        rd.addField(EmailConfig.ATTACHMENTNAME_FIELD, "");
                                    }
                                }
                            }
                        }
                        // Content includes both body and attachments,
                        // Body will be set as content and attachments will be indexed as separate documents.
                        final EmailContent bodyContent = extractBodyContent(msg);
                        if (bodyContent != null) {
                            rd.setMimeType(bodyContent.getMimeType());
                            InputStream is = new ByteArrayInputStream(bodyContent.getContent().getBytes(StandardCharsets.UTF_8));
                            try {
                                rd.setBinary(is, fileLength);
                                activities.ingestDocumentWithException(documentIdentifier, version, msgURL, rd);
                                errorCode = "OK";
                                fileLengthLong = new Long(fileLength);
                            } finally {
                                is.close();
                            }
                        }
                        // If we're supposed to deal with attachments, this is the time to queue them up
                        if (attachmentUrlTemplate != null) {
                            if (msg.getContent() != null && msg.getContent() instanceof Multipart) {
                                final Multipart mp = (Multipart) msg.getContent();
                                final int numAttachments = mp.getCount();
                                for (int i = 0; i < numAttachments; i++) {
                                    if (isAttachment(mp.getBodyPart(i))) {
                                        activities.addDocumentReference(documentIdentifier + ":" + i);
                                    }
                                }
                            }
                        }
                    } catch (InterruptedException e) {
                        throw new ManifoldCFException(e.getMessage(), ManifoldCFException.INTERRUPTED);
                    } catch (MessagingException e) {
                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                        errorDesc = e.getMessage();
                        handleMessagingException(e, "processing email");
                    } catch (IOException e) {
                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                        errorDesc = e.getMessage();
                        handleIOException(e, "processing email");
                        throw new ManifoldCFException(e.getMessage(), e);
                    }
                } catch (ManifoldCFException e) {
                    if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                        errorCode = null;
                    throw e;
                } finally {
                    if (errorCode != null)
                        activities.recordActivity(new Long(startTime), EmailConfig.ACTIVITY_FETCH, fileLengthLong, documentIdentifier, errorCode, errorDesc, null);
                }
            } else {
                // It's a specific attachment
                final int attachmentNumber = attachmentIndex;
                // NOT empty; we need to make ManifoldCF understand that this is a document that never will change.
                String versionString = "_" + attachmentUrlTemplate;
                // Check if we need to index
                if (!activities.checkDocumentNeedsReindexing(documentIdentifier, versionString))
                    continue;
                String compositeID = documentIdentifier;
                String version = versionString;
                String folderName = extractFolderNameFromDocumentIdentifier(compositeID);
                String id = extractEmailIDFromDocumentIdentifier(compositeID);
                String errorCode = null;
                String errorDesc = null;
                Long fileLengthLong = null;
                long startTime = System.currentTimeMillis();
                try {
                    try {
                        Folder folder = openFolders.get(folderName);
                        if (folder == null) {
                            getSession();
                            OpenFolderThread oft = new OpenFolderThread(session, folderName);
                            oft.start();
                            folder = oft.finishUp();
                            openFolders.put(folderName, folder);
                        }
                        if (Logging.connectors.isDebugEnabled())
                            Logging.connectors.debug("Email: Processing document identifier '" + documentIdentifier + "'");
                        SearchTerm messageIDTerm = new MessageIDTerm(id);
                        getSession();
                        SearchMessagesThread smt = new SearchMessagesThread(session, folder, messageIDTerm);
                        smt.start();
                        Message[] message = smt.finishUp();
                        String msgURL = makeDocumentURI(attachmentUrlTemplate, folderName, id, attachmentNumber);
                        Message msg = null;
                        for (Message msg2 : message) {
                            msg = msg2;
                        }
                        if (msg == null) {
                            // email was not found
                            activities.deleteDocument(documentIdentifier);
                            continue;
                        }
                        if (!activities.checkURLIndexable(msgURL)) {
                            errorCode = activities.EXCLUDED_URL;
                            errorDesc = "Excluded because of URL ('" + msgURL + "')";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        final Date sentDate = msg.getSentDate();
                        if (!activities.checkDateIndexable(sentDate)) {
                            errorCode = activities.EXCLUDED_DATE;
                            errorDesc = "Excluded because of date (" + sentDate + ")";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        final Multipart mp = (Multipart) msg.getContent();
                        if (mp.getCount() <= attachmentNumber) {
                            activities.deleteDocument(documentIdentifier);
                            continue;
                        }
                        final Part part = mp.getBodyPart(attachmentNumber);
                        final long fileLength = part.getSize();
                        if (!activities.checkLengthIndexable(fileLength)) {
                            errorCode = activities.EXCLUDED_LENGTH;
                            errorDesc = "Excluded because of length (" + fileLength + ")";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        final String origMimeType = part.getContentType();
                        final String mimeType;
                        // Example: "application/msword; name=SampleDOCFile_100kb.doc"
                        if (origMimeType == null || origMimeType.indexOf(";") == -1) {
                            mimeType = origMimeType;
                        } else {
                            mimeType = origMimeType.substring(0, origMimeType.indexOf(";"));
                        }
                        if (!activities.checkMimeTypeIndexable(mimeType)) {
                            errorCode = activities.EXCLUDED_MIMETYPE;
                            errorDesc = "Excluded because of mime type ('" + mimeType + "')";
                            activities.noDocument(documentIdentifier, version);
                            continue;
                        }
                        RepositoryDocument rd = new RepositoryDocument();
                        rd.setFileName(part.getFileName());
                        rd.setMimeType(mimeType);
                        rd.setCreatedDate(sentDate);
                        rd.setModifiedDate(sentDate);
                        for (String metadata : requiredMetadata) {
                            if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_TO)) {
                                Address[] to = msg.getRecipients(Message.RecipientType.TO);
                                if (to != null) {
                                    String[] toStr = new String[to.length];
                                    int j = 0;
                                    for (Address address : to) {
                                        toStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                                        j++;
                                    }
                                    rd.addField(EmailConfig.EMAIL_TO, toStr);
                                }
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_FROM)) {
                                Address[] from = msg.getFrom();
                                String[] fromStr = new String[from.length];
                                int j = 0;
                                for (Address address : from) {
                                    fromStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
                                    j++;
                                }
                                rd.addField(EmailConfig.EMAIL_FROM, fromStr);
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_SUBJECT)) {
                                String subject = msg.getSubject();
                                // Attachments may have a field named "subject". So, different field name is used not to clash.
                                rd.addField(EmailConfig.MAILSUBJECT_FIELD, subject);
                            } else if (metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_DATE)) {
                                rd.addField(EmailConfig.EMAIL_DATE, sentDate.toString());
                            }
                        }
                        final InputStream is = part.getInputStream();
                        try {
                            rd.setBinary(is, fileLength);
                            activities.ingestDocumentWithException(documentIdentifier, version, msgURL, rd);
                            errorCode = "OK";
                            fileLengthLong = new Long(fileLength);
                        } finally {
                            is.close();
                        }
                    } catch (InterruptedException e) {
                        throw new ManifoldCFException(e.getMessage(), ManifoldCFException.INTERRUPTED);
                    } catch (MessagingException e) {
                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                        errorDesc = e.getMessage();
                        handleMessagingException(e, "processing email attachment");
                    } catch (IOException e) {
                        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                        errorDesc = e.getMessage();
                        handleIOException(e, "processing email attachment");
                        throw new ManifoldCFException(e.getMessage(), e);
                    }
                } catch (ManifoldCFException e) {
                    if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                        errorCode = null;
                    throw e;
                } finally {
                    if (errorCode != null)
                        activities.recordActivity(new Long(startTime), EmailConfig.ACTIVITY_FETCH, fileLengthLong, documentIdentifier, errorCode, errorDesc, null);
                }
            }
        }
    } finally {
        for (Folder f : openFolders.values()) {
            try {
                CloseFolderThread cft = new CloseFolderThread(session, f);
                cft.start();
                cft.finishUp();
            } catch (InterruptedException e) {
                throw new ManifoldCFException(e.getMessage(), ManifoldCFException.INTERRUPTED);
            } catch (MessagingException e) {
                handleMessagingException(e, "closing folders");
            }
        }
    }
}
Also used: MimeMessage(javax.mail.internet.MimeMessage) RepositoryDocument(org.apache.manifoldcf.agents.interfaces.RepositoryDocument) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) InterruptedIOException(java.io.InterruptedIOException) ByteArrayInputStream(java.io.ByteArrayInputStream)

Example 4 with RepositoryDocument

use of org.apache.manifoldcf.agents.interfaces.RepositoryDocument in project manifoldcf by apache.

the class DropboxRepositoryConnector method processDocuments.

/**
 * Process a set of documents.
 * This is the method that should cause each document to be fetched, processed, and the results either added
 * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
 * The document specification allows this class to filter what is done based on the job.
 * The connector will be connected before this method can be called.
 *@param documentIdentifiers is the set of document identifiers to process.
 *@param statuses are the currently-stored document versions for each document in the set of document identifiers
 * passed in above.
 *@param spec is the document specification in effect for the current job.
 *@param activities is the interface this method should use to queue up new document references
 * and ingest documents.
 *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
 *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
 */
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption {
    // Guarded logging, consistent with the other connectors in this project.
    if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("DROPBOX: Inside processDocuments");
    // Forced acls from the job specification; sorted so the packed version string below is canonical.
    String[] acls = getAcls(spec);
    java.util.Arrays.sort(acls);
    for (String documentIdentifier : documentIdentifiers) {
        getSession();
        String versionString;
        // Fetch the Dropbox entry metadata on a background thread so the fetch can be interrupted cleanly.
        GetObjectThread objt = new GetObjectThread(documentIdentifier);
        objt.start();
        try {
            objt.finishUp();
        } catch (InterruptedException e) {
            // Courtesy interrupt before abandoning the worker thread.
            objt.interrupt();
            throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
        } catch (DropboxException e) {
            Logging.connectors.warn("DROPBOX: Error getting object: " + e.getMessage(), e);
            // NOTE(review): presumably handleDropboxException always throws; otherwise
            // getResponse() below could return null — confirm against its implementation.
            handleDropboxException(e);
        }
        DropboxAPI.Entry dropboxObject = objt.getResponse();
        if (dropboxObject.isDir) {
            // A folder is never ingested itself; it only contributes references to its children.
            versionString = StringUtils.EMPTY;
            List<DropboxAPI.Entry> children = dropboxObject.contents;
            for (DropboxAPI.Entry child : children) {
                activities.addDocumentReference(child.path, documentIdentifier, RELATIONSHIP_CHILD);
            }
            activities.noDocument(documentIdentifier, versionString);
            continue;
        }
        if (dropboxObject.isDeleted) {
            activities.deleteDocument(documentIdentifier);
            continue;
        }
        if (StringUtils.isEmpty(dropboxObject.rev)) {
            // Without revision info we cannot track versions, so the document is never processed.
            activities.deleteDocument(documentIdentifier);
            continue;
        }
        // Version string layout: packed ACLs ('+' delimited, plus the deny token when ACLs exist,
        // '-' marker otherwise) followed by the Dropbox revision.
        StringBuilder sb = new StringBuilder();
        packList(sb, acls, '+');
        if (acls.length > 0) {
            sb.append('+');
            pack(sb, defaultAuthorityDenyToken, '+');
        } else
            sb.append('-');
        sb.append(dropboxObject.rev);
        versionString = sb.toString();
        if (!activities.checkDocumentNeedsReindexing(documentIdentifier, versionString))
            continue;
        long startTime = System.currentTimeMillis();
        String errorCode = null;
        String errorDesc = null;
        Long fileSize = null;
        String nodeId = documentIdentifier;
        String version = versionString;
        try {
            // Length in bytes
            long fileLength = dropboxObject.bytes;
            if (!activities.checkLengthIndexable(fileLength)) {
                errorCode = activities.EXCLUDED_LENGTH;
                errorDesc = "Document excluded because of length (" + fileLength + ")";
                activities.noDocument(documentIdentifier, versionString);
                continue;
            }
            // documentURI
            String documentURI = dropboxObject.path;
            if (!activities.checkURLIndexable(documentURI)) {
                errorCode = activities.EXCLUDED_URL;
                errorDesc = "Document excluded because of URL ('" + documentURI + "')";
                activities.noDocument(documentIdentifier, versionString);
                continue;
            }
            // Modified date
            Date modifiedDate;
            if (dropboxObject.modified != null)
                modifiedDate = com.dropbox.client2.RESTUtility.parseDate(dropboxObject.modified);
            else
                modifiedDate = null;
            if (!activities.checkDateIndexable(modifiedDate)) {
                errorCode = activities.EXCLUDED_DATE;
                errorDesc = "Document excluded because of date (" + modifiedDate + ")";
                activities.noDocument(documentIdentifier, versionString);
                continue;
            }
            // Mime type
            String mimeType = dropboxObject.mimeType;
            if (!activities.checkMimeTypeIndexable(mimeType)) {
                errorCode = activities.EXCLUDED_MIMETYPE;
                errorDesc = "Document excluded because of mime type ('" + mimeType + "')";
                activities.noDocument(documentIdentifier, versionString);
                continue;
            }
            // content ingestion
            RepositoryDocument rd = new RepositoryDocument();
            if (acls.length > 0) {
                rd.setSecurityACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT, acls);
                String[] denyAclArray = new String[] { defaultAuthorityDenyToken };
                rd.setSecurityDenyACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT, denyAclArray);
            }
            if (dropboxObject.path != null)
                rd.setFileName(dropboxObject.path);
            if (dropboxObject.mimeType != null)
                rd.setMimeType(dropboxObject.mimeType);
            if (dropboxObject.modified != null)
                rd.setModifiedDate(modifiedDate);
            // There doesn't appear to be a created date...
            rd.addField("Modified", dropboxObject.modified);
            rd.addField("Size", dropboxObject.size);
            rd.addField("Path", dropboxObject.path);
            rd.addField("Root", dropboxObject.root);
            rd.addField("ClientMtime", dropboxObject.clientMtime);
            rd.addField("mimeType", dropboxObject.mimeType);
            rd.addField("rev", dropboxObject.rev);
            getSession();
            // Stream the content from Dropbox on a background thread, feeding the ingestion pipeline.
            BackgroundStreamThread t = new BackgroundStreamThread(nodeId);
            t.start();
            try {
                boolean wasInterrupted = false;
                try {
                    InputStream is = t.getSafeInputStream();
                    try {
                        rd.setBinary(is, fileLength);
                        activities.ingestDocumentWithException(nodeId, version, documentURI, rd);
                        // No errors.  Record the fact that we made it.
                        errorCode = "OK";
                        fileSize = Long.valueOf(fileLength);
                    } finally {
                        is.close();
                    }
                } catch (java.net.SocketTimeoutException e) {
                    throw e;
                } catch (InterruptedIOException e) {
                    wasInterrupted = true;
                    throw e;
                } catch (ManifoldCFException e) {
                    if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                        wasInterrupted = true;
                    throw e;
                } finally {
                    // Only join the stream thread if we were not interrupted; a join after an
                    // interrupt could block indefinitely.
                    if (!wasInterrupted)
                        // This does a join
                        t.finishUp();
                }
            } catch (InterruptedException e) {
                // We were interrupted out of the join, most likely.  Before we abandon the thread,
                // send a courtesy interrupt.
                t.interrupt();
                throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
            } catch (java.net.SocketTimeoutException e) {
                errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                errorDesc = e.getMessage();
                handleIOException(e);
            } catch (InterruptedIOException e) {
                t.interrupt();
                throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
            } catch (IOException e) {
                errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                errorDesc = e.getMessage();
                handleIOException(e);
            } catch (DropboxException e) {
                Logging.connectors.warn("DROPBOX: Error getting stream: " + e.getMessage(), e);
                errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                errorDesc = e.getMessage();
                handleDropboxException(e);
            }
        } catch (ManifoldCFException e) {
            // Suppress activity recording on interruption; the document will be retried.
            if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
                errorCode = null;
            throw e;
        } finally {
            if (errorCode != null)
                activities.recordActivity(Long.valueOf(startTime), ACTIVITY_READ, fileSize, nodeId, errorCode, errorDesc, null);
        }
    }
}
Also used: InterruptedIOException(java.io.InterruptedIOException) InputStream(java.io.InputStream) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) Date(java.util.Date) ManifoldCFException(org.apache.manifoldcf.core.interfaces.ManifoldCFException) DropboxException(com.dropbox.client2.exception.DropboxException) DropboxAPI(com.dropbox.client2.DropboxAPI) RepositoryDocument(org.apache.manifoldcf.agents.interfaces.RepositoryDocument)

Example 5 with RepositoryDocument

use of org.apache.manifoldcf.agents.interfaces.RepositoryDocument in project manifoldcf by apache.

the class TikaExtractor method addOrReplaceDocumentWithException.

/**
 * Add (or replace) a document in the output data store using the connector. This method presumes that the connector object has been configured, and it is thus able to communicate with the output
 * data store should that be necessary. The OutputSpecification is *not* provided to this method, because the goal is consistency, and if output is done it must be consistent with the output
 * description, since that was what was partly used to determine if output should be taking place. So it may be necessary for this method to decode an output description string in order to determine
 * what should be done.
 *
 * @param documentURI         is the URI of the document. The URI is presumed to be the unique identifier which the output data store will use to process and serve the document. This URI is
 *                            constructed by the repository connector which fetches the document, and is thus universal across all output connectors.
 * @param pipelineDescription is the description string that was constructed for this document by the getOutputDescription() method.
 * @param document            is the document data to be processed (handed to the output data store).
 * @param authorityNameString is the name of the authority responsible for authorizing any access tokens passed in with the repository document. May be null.
 * @param activities          is the handle to an object that the implementer of a pipeline connector may use to perform operations, such as logging processing activity, or sending a modified
 *                            document to the next stage in the pipeline.
 * @return the document status (accepted or permanently rejected).
 * @throws IOException only if there's a stream error reading the document data.
 */
@Override
public int addOrReplaceDocumentWithException(final String documentURI, final VersionContext pipelineDescription, final RepositoryDocument document, final String authorityNameString, final IOutputAddActivity activities) throws ManifoldCFException, ServiceInterruption, IOException {
    // text/plain;charset=utf-8
    if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8")) {
        activities.noDocument();
        activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI, activities.EXCLUDED_MIMETYPE, "Downstream pipeline rejected mime type 'text/plain;charset=utf-8'");
        return DOCUMENTSTATUS_REJECTED;
    }
    final SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
    getSession();
    // Tika server variables
    CloseableHttpResponse response = null;
    // Tika's API reads from an input stream and writes to an output Writer.
    // Since a RepositoryDocument includes readers and inputstreams exclusively,
    // AND all downstream
    // processing needs to occur in a ManifoldCF thread, we have some
    // constraints on the architecture we need to get this done:
    // (1) The principle worker thread must call the downstream pipeline send()
    // method.
    // (2) The callee of the send() method must call a reader in the Repository
    // Document.
    // (3) The Reader, if its databuffer is empty, must pull more data from the
    // original input stream and hand it to Tika, which populates the Reader's
    // databuffer.
    // So all this can be done in one thread, with some work, and the creation
    // of a special InputStream or Reader implementation. Where it fails,
    // though, is the
    // requirement that tika-extracted metadata be included in the
    // RepositoryDocument right from the beginning. Effectively this means that
    // the entire document
    // must be parsed before it is handed downstream -- so basically a temporary
    // file (or in-memory buffer if small enough) must be created.
    // Instead of the elegant flow above, we have the following:
    // (1) Create a temporary file (or in-memory buffer if file is small enough)
    // (2) Run Tika to completion, streaming content output to temporary file
    // (3) Modify RepositoryDocument to read from temporary file, and include
    // Tika-extracted metadata
    // (4) Call downstream document processing
    // Prepare the destination storage
    DestinationStorage ds;
    if (document.getBinaryLength() <= inMemoryMaximumFile) {
        ds = new MemoryDestinationStorage((int) document.getBinaryLength());
    } else {
        ds = new FileDestinationStorage();
    }
    try {
        final Map<String, List<String>> metadata = new HashMap<>();
        if (document.getFileName() != null) {
            metadata.put(TikaMetadataKeys.RESOURCE_NAME_KEY, new ArrayList<>());
            metadata.put("stream_name", new ArrayList<>());
            metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY).add(document.getFileName());
            metadata.get("stream_name").add(document.getFileName());
        }
        metadata.put("stream_size", new ArrayList<>());
        metadata.get("stream_size").add(String.valueOf(document.getBinaryLength()));
        // We only log the extraction
        final long startTime = System.currentTimeMillis();
        String resultCode = "OK";
        String description = "";
        Long length = 0L;
        boolean truncated = false;
        boolean resources_limit = false;
        int tikaServerResultCode = 0;
        try {
            try {
                // option is set to true
                if (!isArchive(document.getFileName(), document.getMimeType()) || isArchive(document.getFileName(), document.getMimeType()) && sp.extractArchives) {
                    // Send document to the Tika Server
                    final HttpPut httpPut = new HttpPut(rmetaURI);
                    if (sp.writeLimit != -1) {
                        httpPut.addHeader("writeLimit", String.valueOf(sp.writeLimit));
                    }
                    if (sp.maxEmbeddedResources != -1) {
                        httpPut.addHeader("maxEmbeddedResources", String.valueOf(sp.maxEmbeddedResources));
                    }
                    final HttpEntity entity = new InputStreamEntity(document.getBinaryStream());
                    httpPut.setEntity(entity);
                    try {
                        response = this.httpClient.execute(tikaHost, httpPut);
                    } catch (final SocketTimeoutException e) {
                        // The document is probably too big ! So we don't retry it
                        resultCode = "TIKASERVERRESPONSETIMEOUT";
                        description = "Socket timeout while processing document " + documentURI + " : " + e.getMessage();
                        tikaServerResultCode = handleTikaServerError(description);
                    } catch (final SocketException e) {
                        // failure
                        if (!(e instanceof ConnectException) && !(e instanceof BindException) && !(e instanceof NoRouteToHostException) && !(e instanceof PortUnreachableException)) {
                            resultCode = "TIKASERVERSOCKETEXCEPTION";
                            description = "Socket exception while processing document " + documentURI + " : " + e.getMessage();
                            tikaServerResultCode = handleTikaServerError(description);
                            retryWithoutAbort(e);
                        } else {
                            // The tika server seams to be down : retry {retryNumber} times and abort the
                            // job if it fails on
                            // each retry
                            resultCode = "TIKASERVEREXCEPTION";
                            description = "Tika seemed to be down when requested to process document " + documentURI + " : " + e.getMessage();
                            tikaServerResultCode = handleTikaServerError(description);
                            triggerServiceInterruption(documentURI, e);
                        }
                    } catch (final NoHttpResponseException e) {
                        // Tika probably does not manage to process document in time (task timeout)
                        resultCode = "TIKASERVERNORESPONSEEXCEPTION";
                        description = "Tika does not manage to treat " + documentURI + " (potential task timeout): " + e.getMessage();
                        tikaServerResultCode = handleTikaServerError(description);
                    } catch (final IOException e) {
                        // Unknown problem with the Tika Server. Retry {retryNumber} times and abort
                        // the job if it fails on
                        // each retry
                        resultCode = "TIKASERVEREXCEPTION";
                        description = "Unknown Tika problem when processing document " + documentURI + " : " + e.getMessage();
                        tikaServerResultCode = handleTikaServerError(description);
                        triggerServiceInterruption(documentURI, e);
                    }
                    if (response != null) {
                        final int responseCode = response.getStatusLine().getStatusCode();
                        if (responseCode == 200 || responseCode == 204) {
                            try (final OutputStream os = ds.getOutputStream();
                                Writer w = new OutputStreamWriter(os, StandardCharsets.UTF_8.name());
                                InputStream is = response.getEntity().getContent()) {
                                final JsonFactory jfactory = new JsonFactory();
                                final JsonParser jParser = jfactory.createParser(is);
                                JsonToken token = null;
                                // Go to beginning of metadata
                                boolean inMetadata = false;
                                while (!inMetadata && (token = jParser.nextToken()) != null) {
                                    if (token == JsonToken.START_OBJECT) {
                                        inMetadata = true;
                                    }
                                }
                                int totalMetadataLength = 0;
                                boolean maxMetadataReached = false;
                                boolean metadataSkipped = false;
                                if (token != null) {
                                    while ((token = jParser.nextToken()) != null && token != JsonToken.END_OBJECT) {
                                        final int fieldNameLength = jParser.getTextLength();
                                        if (fieldNameLength <= maxMetadataNameLength) {
                                            final String fieldName = jParser.getCurrentName();
                                            if (fieldName != null) {
                                                if (fieldName.startsWith("X-Parsed-By")) {
                                                    skipMetadata(jParser);
                                                } else if (fieldName.contentEquals("X-TIKA:content")) {
                                                    // Consume content
                                                    jParser.nextToken();
                                                    length += jParser.getText(w);
                                                } else if (!fieldName.startsWith("X-TIKA")) {
                                                    token = jParser.nextToken();
                                                    if (!metadata.containsKey(fieldName)) {
                                                        totalMetadataLength += fieldName.length();
                                                        metadata.put(fieldName, new ArrayList<>());
                                                    }
                                                    if (token == JsonToken.START_ARRAY) {
                                                        while (jParser.nextToken() != JsonToken.END_ARRAY) {
                                                            if (jParser.getTextLength() <= sp.maxMetadataValueLength) {
                                                                final int totalMetadataLengthPreview = totalMetadataLength + jParser.getTextLength();
                                                                if (totalMetadataLengthPreview <= sp.totalMetadataLimit) {
                                                                    metadata.get(fieldName).add(jParser.getText());
                                                                    totalMetadataLength = totalMetadataLengthPreview;
                                                                } else {
                                                                    maxMetadataReached = true;
                                                                }
                                                            } else {
                                                                metadataSkipped = true;
                                                                if (Logging.ingest.isDebugEnabled()) {
                                                                    Logging.ingest.debug("Skip value of metadata " + fieldName + " of document " + documentURI + " because it exceeds the max value limit of " + sp.maxMetadataValueLength);
                                                                }
                                                            }
                                                        }
                                                    } else {
                                                        if (jParser.getTextLength() <= sp.maxMetadataValueLength) {
                                                            final int totalMetadataLengthPreview = totalMetadataLength + jParser.getTextLength();
                                                            if (totalMetadataLengthPreview <= sp.totalMetadataLimit) {
                                                                metadata.get(fieldName).add(jParser.getText());
                                                            } else {
                                                                maxMetadataReached = true;
                                                            }
                                                        } else {
                                                            metadataSkipped = true;
                                                            if (Logging.ingest.isDebugEnabled()) {
                                                                Logging.ingest.debug("Skip value of metadata " + fieldName + " of document " + documentURI + " because it exceeds the max value limit of " + sp.maxMetadataValueLength);
                                                            }
                                                        }
                                                    }
                                                    // Remove metadata if no data has been gathered
                                                    if (metadata.get(fieldName).isEmpty()) {
                                                        totalMetadataLength -= fieldName.length();
                                                        metadata.remove(fieldName);
                                                    }
                                                } else if (fieldName.startsWith("X-TIKA:EXCEPTION:")) {
                                                    boolean unknownException = false;
                                                    if (fieldName.contentEquals("X-TIKA:EXCEPTION:write_limit_reached")) {
                                                        resultCode = "TRUNCATEDOK";
                                                        truncated = true;
                                                    } else if (fieldName.contentEquals("X-TIKA:EXCEPTION:embedded_resource_limit_reached")) {
                                                        resources_limit = true;
                                                    } else {
                                                        unknownException = true;
                                                        resultCode = "TIKAEXCEPTION";
                                                        jParser.nextToken();
                                                        description += fieldName + ": " + jParser.getText() + System.lineSeparator();
                                                    }
                                                    if (!unknownException) {
                                                        skipMetadata(jParser);
                                                    }
                                                } else {
                                                    skipMetadata(jParser);
                                                }
                                            }
                                        } else {
                                            metadataSkipped = true;
                                            if (Logging.ingest.isDebugEnabled()) {
                                                Logging.ingest.debug("Skip a metadata of document " + documentURI + " because its name exceeds the max allowed length of " + maxMetadataNameLength);
                                            }
                                            skipMetadata(jParser);
                                        }
                                    }
                                    jParser.close();
                                }
                                if (maxMetadataReached) {
                                    description += "Some metadata have been skipped because the total metadata limit of " + sp.totalMetadataLimit + " has been reached" + System.lineSeparator();
                                } else if (metadataSkipped) {
                                    description += "Some metadata have been skipped because their names or values exceeded the limits" + System.lineSeparator();
                                }
                            }
                        } else if (responseCode == 503) {
                            // Service interruption; Tika trying to come up.
                            // Retry unlimited times, retryInterval ms between retries
                            resultCode = "TIKASERVERUNAVAILABLE";
                            description = "Tika Server was unavailable: 503 " + response.getStatusLine().getReasonPhrase();
                            tikaServerResultCode = handleTikaServerError(description);
                            Logging.ingest.warn("Tika Server unavailable, retrying...");
                            final long currentTime = System.currentTimeMillis();
                            throw new ServiceInterruption("Tika Server unavailable, retrying...", new Exception(description), currentTime + retryInterval, -1L, -1, false);
                        } else {
                            if (responseCode == 500) {
                                resultCode = "TIKASERVERERROR";
                                description = "Tika Server failed to parse document with the following error: " + response.getStatusLine().getReasonPhrase();
                                tikaServerResultCode = handleTikaServerError(description);
                            } else {
                                resultCode = "TIKASERVERREJECTS";
                                description = "Tika Server rejected document " + documentURI + " with the following reason: " + response.getStatusLine().getReasonPhrase();
                                tikaServerResultCode = handleTikaServerRejects(description);
                            }
                        }
                    }
                } else {
                    resultCode = "EXCLUDED";
                    description = "Detected as an archive file and the extract archives option is set to false";
                }
            } catch (final IOException e) {
                resultCode = "TIKASERVERRESPONSEISSUE";
                if (e.getMessage() != null) {
                    description = e.getMessage();
                }
                tikaServerResultCode = handleTikaServerException(e);
            } finally {
                if (response != null) {
                    response.close();
                }
            }
            if (!activities.checkLengthIndexable(ds.getBinaryLength())) {
                activities.noDocument();
                resultCode = activities.EXCLUDED_LENGTH;
                description = "Downstream pipeline rejected document with length " + ds.getBinaryLength();
                return DOCUMENTSTATUS_REJECTED;
            }
        } finally {
            // Before injecting activity record, clean the description as it can contains non ascii chars that can cause errors during SQL insertion
            description = description.replaceAll("[^\\x20-\\x7e]", "");
            // Log the extraction processing
            activities.recordActivity(startTime, ACTIVITY_EXTRACT, length, documentURI, resultCode, description);
        }
        // Parsing complete!
        // Create a copy of Repository Document
        final RepositoryDocument docCopy = document.duplicate();
        // Open new input stream
        final InputStream is = ds.getInputStream();
        // Get new stream length
        final long newBinaryLength = ds.getBinaryLength();
        try {
            docCopy.setBinary(is, newBinaryLength);
            // mapper eventually...
            for (String mName : metadata.keySet()) {
                String[] values = metadata.get(mName).toArray(new String[0]);
                // Only keep metadata if its name does not exceed 8k chars to avoid HTTP header error
                if (mName.length() < maxMetadataNameLength) {
                    if (sp.lowerNames()) {
                        final StringBuilder sb = new StringBuilder();
                        for (int i = 0; i < mName.length(); i++) {
                            char ch = mName.charAt(i);
                            if (!Character.isLetterOrDigit(ch)) {
                                ch = '_';
                            } else {
                                ch = Character.toLowerCase(ch);
                            }
                            sb.append(ch);
                        }
                        mName = sb.toString();
                    }
                    final String target = sp.getMapping(mName);
                    if (target != null) {
                        if (docCopy.getField(target) != null) {
                            final String[] persistentValues = docCopy.getFieldAsStrings(target);
                            values = ArrayUtils.addAll(persistentValues, values);
                        }
                        docCopy.addField(target, values);
                    } else {
                        if (sp.keepAllMetadata()) {
                            if (docCopy.getField(mName) != null) {
                                final String[] persistentValues = docCopy.getFieldAsStrings(mName);
                                values = ArrayUtils.addAll(persistentValues, values);
                            }
                            docCopy.addField(mName, values);
                        }
                    }
                }
            }
            if (truncated) {
                removeField(docCopy, "truncated");
                docCopy.addField("truncated", "true");
            } else {
                removeField(docCopy, "truncated");
                docCopy.addField("truncated", "false");
            }
            if (resources_limit) {
                removeField(docCopy, "resources_limit");
                docCopy.addField("resources_limit", "true");
            } else {
                removeField(docCopy, "resources_limit");
                docCopy.addField("resources_limit", "false");
            }
            // Send new document downstream
            final int sendDocumentResultCode = activities.sendDocument(documentURI, docCopy);
            if (sendDocumentResultCode == 0) {
                return tikaServerResultCode;
            } else {
                return sendDocumentResultCode;
            }
        } finally {
            // This is really important to close the input stream in a finally statement as it will wait that the input stream is fully read (or closed) by down pipeline
            is.close();
        }
    } finally {
        if (ds != null) {
            ds.close();
        }
    }
}
Also used : SocketException(java.net.SocketException) PortUnreachableException(java.net.PortUnreachableException) HttpEntity(org.apache.http.HttpEntity) HashMap(java.util.HashMap) ByteArrayOutputStream(java.io.ByteArrayOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) JsonFactory(com.fasterxml.jackson.core.JsonFactory) NoRouteToHostException(java.net.NoRouteToHostException) HttpPut(org.apache.http.client.methods.HttpPut) ServiceInterruption(org.apache.manifoldcf.agents.interfaces.ServiceInterruption) CloseableHttpResponse(org.apache.http.client.methods.CloseableHttpResponse) List(java.util.List) ArrayList(java.util.ArrayList) JsonToken(com.fasterxml.jackson.core.JsonToken) RepositoryDocument(org.apache.manifoldcf.agents.interfaces.RepositoryDocument) ConnectException(java.net.ConnectException) JsonParser(com.fasterxml.jackson.core.JsonParser) NoHttpResponseException(org.apache.http.NoHttpResponseException) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) BindException(java.net.BindException) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) URISyntaxException(java.net.URISyntaxException) ManifoldCFException(org.apache.manifoldcf.core.interfaces.ManifoldCFException) ParseException(org.json.simple.parser.ParseException) SocketTimeoutException(java.net.SocketTimeoutException) InputStreamEntity(org.apache.http.entity.InputStreamEntity) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer)

Aggregations

RepositoryDocument (org.apache.manifoldcf.agents.interfaces.RepositoryDocument)26 IOException (java.io.IOException)14 InputStream (java.io.InputStream)14 Date (java.util.Date)14 ManifoldCFException (org.apache.manifoldcf.core.interfaces.ManifoldCFException)12 InterruptedIOException (java.io.InterruptedIOException)11 ByteArrayInputStream (java.io.ByteArrayInputStream)7 DateFormat (java.text.DateFormat)6 ArrayList (java.util.ArrayList)5 HashMap (java.util.HashMap)5 List (java.util.List)5 Test (org.junit.Test)5 Specification (org.apache.manifoldcf.core.interfaces.Specification)4 SpecificationNode (org.apache.manifoldcf.core.interfaces.SpecificationNode)4 IExistingVersions (org.apache.manifoldcf.crawler.interfaces.IExistingVersions)4 IProcessActivity (org.apache.manifoldcf.crawler.interfaces.IProcessActivity)4 SimpleDateFormat (java.text.SimpleDateFormat)3 AlfrescoResponse (com.github.maoo.indexer.client.AlfrescoResponse)2 FileInputStream (java.io.FileInputStream)2 FileOutputStream (java.io.FileOutputStream)2