Search in sources :

Example 11 with Page

use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.

the class ConfluenceRepositoryConnector method addSeedDocumentsForSpace.

/**
 * <p>
 * Add seed documents for a given optional space
 * </p>
 * <p>
 * Root pages of the space are fetched page by page through {@code confluenceClient.getSpaceRootPages}; every returned page id is registered as a seed document and, when the
 * specification enables attachment processing, the page is also handed to {@code processSeedAttachments}. Paging stops when the response reports it is the last one.
 * </p>
 *
 * @param space           key of the space whose root pages are seeded
 * @param pageType        optional page type, forwarded unchanged to the Confluence client
 * @param activities      seeding activity that receives the seed document identifiers
 * @param confluenceSpec  specification consulted to decide whether attachments are processed
 * @param lastSeedVersion not used by this method
 * @param seedTime        not used by this method
 * @param jobMode         not used by this method
 *
 * @throws ServiceInterruption
 * @throws ManifoldCFException
 */
private void addSeedDocumentsForSpace(final String space, final Optional<String> pageType, final ISeedingActivity activities, final ConfluenceSpecification confluenceSpec, final String lastSeedVersion, final long seedTime, final int jobMode) throws ManifoldCFException, ServiceInterruption {
    long lastStart = 0;
    final long defaultSize = 50;
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        final String spaceDesc = "space with key " + space;
        Logging.connectors.debug(new MessageFormat("Starting from {0} and size {1} for {2}", Locale.ROOT).format(new Object[] { lastStart, defaultSize, spaceDesc }));
    }
    try {
        // Primitive flag: it is always assigned inside the loop body before the loop condition reads it
        boolean isLast;
        do {
            final ConfluenceResponse<Page> response = confluenceClient.getSpaceRootPages((int) lastStart, (int) defaultSize, space, pageType);
            int count = 0;
            for (final Page page : response.getResults()) {
                activities.addSeedDocument(page.getId());
                if (confluenceSpec.isProcessAttachments()) {
                    processSeedAttachments(page, activities);
                }
                count++;
            }
            if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
                // Integer.valueOf replaces the deprecated Integer(int) constructor; the formatted output is the same
                Logging.connectors.debug(new MessageFormat("Fetched and added {0} seed documents", Locale.ROOT).format(new Object[] { Integer.valueOf(count) }));
            }
            // The next request starts right after the documents actually received
            lastStart += count;
            isLast = response.isLast();
            if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
                Logging.connectors.debug(new MessageFormat("New start {0} and size {1}", Locale.ROOT).format(new Object[] { lastStart, defaultSize }));
            }
        } while (!isLast);
    } catch (final Exception e) {
        // Any failure is delegated; presumably the handler rethrows as ManifoldCFException or ServiceInterruption - TODO confirm
        handleConfluenceDownException(e, "seeding");
    }
}
Also used : MessageFormat(java.text.MessageFormat) JSONObject(org.json.simple.JSONObject) Page(org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page) InterruptedIOException(java.io.InterruptedIOException) ParseException(org.json.simple.parser.ParseException) ManifoldCFException(org.apache.manifoldcf.core.interfaces.ManifoldCFException) IOException(java.io.IOException)

Example 12 with Page

use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.

the class ConfluenceRepositoryConnector method getPageChilds.

/**
 * <p>
 * Collect the child pages of the given page
 * </p>
 * <p>
 * Children are requested from {@code confluenceClient.getPageChilds} in batches of 25 until a response reports it is the last one. Any exception raised while fetching is passed to
 * {@code handleConfluenceDownException}.
 * </p>
 *
 * @param pageId identifier of the page whose children are requested
 * @return the children gathered so far, in the order they were returned
 *
 * @throws ManifoldCFException
 * @throws ServiceInterruption
 */
private List<Page> getPageChilds(final String pageId) throws ManifoldCFException, ServiceInterruption {
    final long batchSize = 25;
    final List<Page> children = new ArrayList<>();
    long offset = 0;
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        final Object[] startArgs = { offset, batchSize, "getPageChilds" };
        Logging.connectors.debug(new MessageFormat("Starting from {0} and size {1} for {2}", Locale.ROOT).format(startArgs));
    }
    try {
        while (true) {
            final ConfluenceResponse<Page> batch = confluenceClient.getPageChilds((int) offset, (int) batchSize, pageId);
            int received = 0;
            for (final Page child : batch.getResults()) {
                children.add(child);
                received++;
            }
            // Advance by what was actually received, not by the requested batch size
            offset += received;
            final Boolean lastBatch = batch.isLast();
            if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
                final Object[] progressArgs = { offset, batchSize, "getPageChilds" };
                Logging.connectors.debug(new MessageFormat("New start {0} and size {1} for {2}", Locale.ROOT).format(progressArgs));
            }
            if (lastBatch) {
                break;
            }
        }
    } catch (final Exception e) {
        // NOTE(review): the context string is "seeding" although this is not the seeding path - confirm it is intended
        handleConfluenceDownException(e, "seeding");
    }
    return children;
}
Also used : MessageFormat(java.text.MessageFormat) ArrayList(java.util.ArrayList) Page(org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page) JSONObject(org.json.simple.JSONObject) InterruptedIOException(java.io.InterruptedIOException) ParseException(org.json.simple.parser.ParseException) ManifoldCFException(org.apache.manifoldcf.core.interfaces.ManifoldCFException) IOException(java.io.IOException)

Example 13 with Page

use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.

the class ConfluenceRepositoryConnector method processPageInternal.

/**
 * <p>
 * Process the specific page
 * </p>
 * <p>
 * Builds a version string from the last modification date, the page's own read restrictions and the parent restrictions, references the direct child pages, and then either
 * retains, excludes or ingests the page depending on the checks offered by {@code activities}.
 * </p>
 *
 * @param activeSecurity             Security enabled/disabled
 * @param parentRestrictions         The list of parent restrictions
 * @param page                       The page to process
 * @param manifoldDocumentIdentifier The document identifier passed to every {@code activities} call made for this page
 * @param version                    The version of the page (not used by this method)
 * @param activities                 The process activity used for checks, child references and ingestion
 * @param doLog                      Not used by this method
 * @param extraProperties            Additional fields added as-is to the repository document; must not be null
 * @return a {@code ProcessResult} built from the page length, plus an error code and description when the page was retained or excluded (both {@code null} after ingestion)
 *
 * @throws ManifoldCFException
 * @throws IOException
 * @throws ServiceInterruption
 */
private ProcessResult processPageInternal(final boolean activeSecurity, final List<String> parentRestrictions, final Page page, final String manifoldDocumentIdentifier, final String version, final IProcessActivity activities, final boolean doLog, final Map<String, String> extraProperties) throws ManifoldCFException, ServiceInterruption, IOException {
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Confluence: This content exists: " + page.getId());
    }
    final Date createdDate = page.getCreatedDate();
    final Date lastModified = page.getLastModifiedDate();
    final DateFormat df = DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.MEDIUM, Locale.ROOT);
    final StringBuilder versionBuilder = new StringBuilder();
    // The modification date is treated as optional further down, so it must not be formatted blindly here:
    // DateFormat.format(null) throws a NullPointerException. A missing date contributes an empty segment.
    if (lastModified != null) {
        versionBuilder.append(df.format(lastModified));
    }
    final List<String> pageRestrictions = new ArrayList<>();
    if (activeSecurity) {
        final List<Restrictions> restrictions = getPageReadRestrictions(page.getId());
        for (final Restrictions res : restrictions) {
            final ReadRestrictions rr = res.getReadRestrictions();
            rr.getUsers().forEach(user -> {
                pageRestrictions.add("user-" + user.getUserKey());
            });
            rr.getGroups().forEach(group -> {
                pageRestrictions.add("group-" + group.getName());
            });
        }
    }
    // Order the page restrictions alphabetically so the version will be always the same in case the same restrictions between two crawls are
    // not retrieved in the same order
    pageRestrictions.sort(String::compareToIgnoreCase);
    versionBuilder.append("+");
    packList(versionBuilder, pageRestrictions, '+');
    versionBuilder.append("+");
    packList(versionBuilder, parentRestrictions, '+');
    final String lastVersion = versionBuilder.toString();
    // Get and reference page direct childs if any
    if (page.getType() == PageType.PAGE) {
        final List<Page> pageChilds = getPageChilds(page.getId());
        for (final Page childPage : pageChilds) {
            final JSONObject child = new JSONObject();
            child.put("id", childPage.getId());
            final List<String> childParentRestrictions = new ArrayList<>();
            // With security enabled, a child receives this page's own restrictions when there are any,
            // otherwise the restrictions this page itself received from its parent
            if (activeSecurity) {
                if (pageRestrictions.isEmpty()) {
                    childParentRestrictions.addAll(parentRestrictions);
                } else {
                    childParentRestrictions.addAll(pageRestrictions);
                }
            }
            childParentRestrictions.sort(String::compareToIgnoreCase);
            // NOTE(review): the key spelling "parentRestricions" is kept as-is; presumably the code reading child references expects it - confirm before renaming
            child.put("parentRestricions", childParentRestrictions);
            activities.addDocumentReference(CHILD_PREFIX + child.toJSONString());
        }
    }
    /*
     * Retain page in Manifold because it has not changed from last time. This is needed to keep the identifier in Manifold data, because by default if a document is not retained nor
     * ingested, it will be deleted by the framework
     */
    if (!activities.checkDocumentNeedsReindexing(manifoldDocumentIdentifier, lastVersion)) {
        return new ProcessResult(page.getLength(), "RETAINED", "");
    }
    /* Exclusion checks: each failed check records "no document" for this version and reports why */
    if (!activities.checkLengthIndexable(page.getLength())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_LENGTH;
        final String errorDesc = "Excluding document because of length (" + page.getLength() + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkMimeTypeIndexable(page.getMediaType())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_MIMETYPE;
        final String errorDesc = "Excluding document because of mime type (" + page.getMediaType() + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkDateIndexable(lastModified)) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_DATE;
        final String errorDesc = "Excluding document because of date (" + lastModified + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkURLIndexable(page.getWebUrl())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_URL;
        final String errorDesc = "Excluding document because of URL ('" + page.getWebUrl() + "')";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    /* Add repository document information */
    final RepositoryDocument rd = new RepositoryDocument();
    rd.setMimeType(page.getMediaType());
    if (createdDate != null) {
        rd.setCreatedDate(createdDate);
    }
    if (lastModified != null) {
        rd.setModifiedDate(lastModified);
    }
    rd.setIndexingDate(new Date());
    /* Adding Page Metadata */
    final Map<String, Object> pageMetadata = page.getMetadataAsMap();
    for (final Entry<String, Object> entry : pageMetadata.entrySet()) {
        if (entry.getValue() instanceof List) {
            final List<?> list = (List<?>) entry.getValue();
            // Convert element by element: copying a List<?> straight into a String[] fails with
            // ArrayStoreException as soon as one element is not a String. Null elements stay null.
            final String[] values = new String[list.size()];
            int index = 0;
            for (final Object item : list) {
                values[index++] = item == null ? null : item.toString();
            }
            rd.addField(entry.getKey(), values);
        } else if (entry.getValue() != null) {
            final String key = entry.getKey();
            final String value = entry.getValue().toString();
            rd.addField(key, value);
            // The title is additionally exposed under the "stream_name" field
            if (key.toLowerCase(Locale.ROOT).contentEquals("title")) {
                rd.addField("stream_name", value);
            }
        }
    }
    rd.addField("source", "confluence");
    /* Adding extra properties */
    for (final Entry<String, String> entry : extraProperties.entrySet()) {
        rd.addField(entry.getKey(), entry.getValue());
    }
    final String documentURI = page.getWebUrl();
    /* Set repository document ACLs */
    if (activeSecurity) {
        rd.setSecurity(RepositoryDocument.SECURITY_TYPE_SHARE, new String[] { "space-" + page.getSpace() }, new String[] { defaultAuthorityDenyToken });
        if (!parentRestrictions.isEmpty()) {
            rd.setSecurity(RepositoryDocument.SECURITY_TYPE_PARENT, parentRestrictions.toArray(new String[0]), new String[] { defaultAuthorityDenyToken });
        }
        if (!pageRestrictions.isEmpty()) {
            rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, pageRestrictions.toArray(new String[0]), new String[] { defaultAuthorityDenyToken });
        }
    }
    rd.setBinary(page.getContentStream(), page.getLength());
    rd.addField("size", String.valueOf(page.getLength()));
    rd.addField("url", documentURI);
    /* Ingest document */
    activities.ingestDocumentWithException(manifoldDocumentIdentifier, lastVersion, documentURI, rd);
    return new ProcessResult(page.getLength(), null, null);
}
Also used : ReadRestrictions(org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Restrictions.ReadRestrictions) ArrayList(java.util.ArrayList) Page(org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page) Date(java.util.Date) JSONObject(org.json.simple.JSONObject) DateFormat(java.text.DateFormat) JSONObject(org.json.simple.JSONObject) ArrayList(java.util.ArrayList) List(java.util.List) RepositoryDocument(org.apache.manifoldcf.agents.interfaces.RepositoryDocument) ReadRestrictions(org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Restrictions.ReadRestrictions) Restrictions(org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Restrictions)

Example 14 with Page

use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.

the class ConfluenceClient method getConfluenceResources.

/**
 * <p>
 * Fetch the {@code ConfluenceResources} served at the given url
 * </p>
 * <p>
 * A GET request is executed against the url, the response entity is converted with the supplied builder and the entity is then consumed before the response is closed.
 * </p>
 *
 * @param url
 *          The url identifying the REST resource to get the documents
 * @param builder
 *          The builder used to build the resources contained in the response
 * @return a {@code ConfluenceResponse} containing the page results
 * @throws Exception
 *           wrapping the original {@code IOException} when the request or the response handling fails with one
 */
private ConfluenceResponse<? extends ConfluenceResource> getConfluenceResources(final String url, final ConfluenceResourceBuilder<? extends ConfluenceResource> builder) throws Exception {
    logger.debug("[Processing] Hitting url for get confluence resources: {}", sanitizeUrl(url));
    // Built outside the try block so that only failures of the exchange itself are wrapped below
    final HttpGet request = createGetRequest(url);
    try (CloseableHttpResponse httpResponse = executeRequest(request)) {
        final ConfluenceResponse<? extends ConfluenceResource> resources = responseFromHttpEntity(httpResponse.getEntity(), builder);
        EntityUtils.consume(httpResponse.getEntity());
        return resources;
    } catch (final IOException ioFailure) {
        logger.error("[Processing] Failed to get page(s)", ioFailure);
        throw new Exception("Confluence appears to be down", ioFailure);
    }
}
Also used : HttpGet(org.apache.http.client.methods.HttpGet) CloseableHttpResponse(org.apache.http.client.methods.CloseableHttpResponse) IOException(java.io.IOException) ManifoldCFException(org.apache.manifoldcf.core.interfaces.ManifoldCFException) ConfluenceException(org.apache.manifoldcf.crawler.connectors.confluence.v6.exception.ConfluenceException) IOException(java.io.IOException)

Example 15 with Page

use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.

the class ConfluenceClient method getAttachment.

/**
 * <p>
 * Gets a specific attachment contained in the specific page
 * </p>
 * <p>
 * The attachment description is read from the content REST resource, then its content is loaded through {@code retrieveAndSetAttachmentContent}.
 * </p>
 *
 * @param attachmentId
 *          The identifier placed in the content url
 * @return the {@code Attachment} instance
 * @throws Exception
 *           any failure raised while requesting or reading the attachment; it is logged and rethrown unchanged
 */
public Attachment getAttachment(final String attachmentId) throws Exception {
    final String url = String.format(Locale.ROOT, "%s://%s:%s%s%s/%s?%s", protocol, host, port, path, CONTENT_PATH, attachmentId, EXPANDABLE_PARAMETERS);
    logger.debug("[Processing] Hitting url for getting document content : {}", sanitizeUrl(url));
    final HttpGet httpGet = createGetRequest(url);
    try (CloseableHttpResponse response = executeRequest(httpGet)) {
        final HttpEntity entity = response.getEntity();
        final MutableAttachment attachment = attachmentFromHttpEntity(entity);
        EntityUtils.consume(entity);
        retrieveAndSetAttachmentContent(attachment);
        return attachment;
    } catch (final Exception e) {
        // Log the sanitized url, as the debug statement above does, so the raw url never reaches the error log
        logger.error("[Processing] Failed to get attachment {}. Error: {}", sanitizeUrl(url), e.getMessage());
        throw e;
    }
}
Also used : HttpEntity(org.apache.http.HttpEntity) HttpGet(org.apache.http.client.methods.HttpGet) CloseableHttpResponse(org.apache.http.client.methods.CloseableHttpResponse) MutableAttachment(org.apache.manifoldcf.crawler.connectors.confluence.v6.model.MutableAttachment) ManifoldCFException(org.apache.manifoldcf.core.interfaces.ManifoldCFException) ConfluenceException(org.apache.manifoldcf.crawler.connectors.confluence.v6.exception.ConfluenceException) IOException(java.io.IOException)

Aggregations

IOException (java.io.IOException)14 Page (model.Page)9 ManifoldCFException (org.apache.manifoldcf.core.interfaces.ManifoldCFException)9 List (java.util.List)5 ServletException (javax.servlet.ServletException)5 CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse)5 ConfluenceException (org.apache.manifoldcf.crawler.connectors.confluence.v6.exception.ConfluenceException)5 JSONObject (org.json.simple.JSONObject)5 InterruptedIOException (java.io.InterruptedIOException)4 SQLException (java.sql.SQLException)4 HttpGet (org.apache.http.client.methods.HttpGet)4 Page (org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page)4 ParseException (org.json.simple.parser.ParseException)4 MessageFormat (java.text.MessageFormat)3 ArrayList (java.util.ArrayList)2 HttpEntity (org.apache.http.HttpEntity)2 Attachment (org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Attachment)2 MutablePage (org.apache.manifoldcf.crawler.connectors.confluence.v6.model.MutablePage)2 StringReader (java.io.StringReader)1 DateFormat (java.text.DateFormat)1