Use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.
Class ConfluenceRepositoryConnector, method addSeedDocumentsForSpace.
/**
 * <p>
 * Add seed documents for a given optional space
 * </p>
 *
 * @param space the key of the space whose root pages are seeded
 * @param pageType optional page-type filter forwarded to the Confluence client
 * @param activities the seeding activity interface used to register seed documents
 * @param confluenceSpec the job specification (controls attachment processing)
 * @param lastSeedVersion supplied by the seeding framework; not used in this method
 * @param seedTime supplied by the seeding framework; not used in this method
 * @param jobMode supplied by the seeding framework; not used in this method
 * @throws ServiceInterruption if Confluence is unreachable (translated by handleConfluenceDownException)
 * @throws ManifoldCFException on framework-level errors
 */
private void addSeedDocumentsForSpace(final String space, final Optional<String> pageType, final ISeedingActivity activities, final ConfluenceSpecification confluenceSpec, final String lastSeedVersion, final long seedTime, final int jobMode) throws ManifoldCFException, ServiceInterruption {
    long lastStart = 0;
    final long defaultSize = 50;
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        final String spaceDesc = "space with key " + space;
        Logging.connectors.debug(new MessageFormat("Starting from {0} and size {1} for {2}", Locale.ROOT).format(new Object[] { lastStart, defaultSize, spaceDesc }));
    }
    try {
        // Primitive boolean: no reason to box the loop flag.
        boolean isLast = true;
        do {
            final ConfluenceResponse<Page> response = confluenceClient.getSpaceRootPages((int) lastStart, (int) defaultSize, space, pageType);
            int count = 0;
            for (final Page page : response.getResults()) {
                activities.addSeedDocument(page.getId());
                if (confluenceSpec.isProcessAttachments()) {
                    processSeedAttachments(page, activities);
                }
                count++;
            }
            if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
                // Integer.valueOf avoids the deprecated new Integer(...) constructor.
                Logging.connectors.debug(new MessageFormat("Fetched and added {0} seed documents", Locale.ROOT).format(new Object[] { Integer.valueOf(count) }));
            }
            lastStart += count;
            isLast = response.isLast();
            // Defensive guard: if the server reports more pages but returns an
            // empty batch, stop instead of looping forever on the same offset.
            if (!isLast && count == 0) {
                break;
            }
            if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
                Logging.connectors.debug(new MessageFormat("New start {0} and size {1}", Locale.ROOT).format(new Object[] { lastStart, defaultSize }));
            }
        } while (!isLast);
    } catch (final Exception e) {
        handleConfluenceDownException(e, "seeding");
    }
}
Use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.
Class ConfluenceRepositoryConnector, method getPageChilds.
/**
 * <p>
 * Retrieve all direct child pages of the given page, following Confluence
 * pagination until the server reports the last batch.
 * </p>
 *
 * @param pageId the identifier of the parent page
 * @return the (possibly empty) list of direct child pages
 * @throws ServiceInterruption if Confluence is unreachable (translated by handleConfluenceDownException)
 * @throws ManifoldCFException on framework-level errors
 */
private List<Page> getPageChilds(final String pageId) throws ManifoldCFException, ServiceInterruption {
    long lastStart = 0;
    final long defaultSize = 25;
    final List<Page> pageChilds = new ArrayList<>();
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug(new MessageFormat("Starting from {0} and size {1} for {2}", Locale.ROOT).format(new Object[] { lastStart, defaultSize, "getPageChilds" }));
    }
    try {
        // Primitive boolean: no reason to box the loop flag.
        boolean isLast = true;
        do {
            final ConfluenceResponse<Page> response = confluenceClient.getPageChilds((int) lastStart, (int) defaultSize, pageId);
            int count = 0;
            for (final Page page : response.getResults()) {
                pageChilds.add(page);
                count++;
            }
            lastStart += count;
            isLast = response.isLast();
            // Defensive guard: an empty batch with isLast == false would
            // otherwise loop forever on the same offset.
            if (!isLast && count == 0) {
                break;
            }
            if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
                Logging.connectors.debug(new MessageFormat("New start {0} and size {1} for {2}", Locale.ROOT).format(new Object[] { lastStart, defaultSize, "getPageChilds" }));
            }
        } while (!isLast);
    } catch (final Exception e) {
        // Fixed copy-paste context label: this method fetches page childs, not seeds.
        handleConfluenceDownException(e, "getting page childs");
    }
    return pageChilds;
}
Use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.
Class ConfluenceRepositoryConnector, method processPageInternal.
/**
 * <p>
 * Process the specific page
 * </p>
 *
 * @param activeSecurity Security enabled/disabled
 * @param parentRestrictions The list of parent restrictions
 * @param page The page to process
 * @param manifoldDocumentIdentifier the document identifier used by the ManifoldCF framework
 * @param version The version of the page (not referenced in this method body)
 * @param activities the process activity interface used to reference, skip or ingest the document
 * @param doLog whether the caller wants this processing logged (not referenced in this method body)
 * @param extraProperties additional metadata fields added verbatim to the repository document
 * @return a {@code ProcessResult} carrying the document length plus, for skipped
 *         documents, the error code and description
 *
 * @throws ManifoldCFException
 * @throws IOException
 * @throws ServiceInterruption
 */
private ProcessResult processPageInternal(final boolean activeSecurity, final List<String> parentRestrictions, final Page page, final String manifoldDocumentIdentifier, final String version, final IProcessActivity activities, final boolean doLog, final Map<String, String> extraProperties) throws ManifoldCFException, ServiceInterruption, IOException {
    if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) {
        Logging.connectors.debug("Confluence: This content exists: " + page.getId());
    }
    final RepositoryDocument rd = new RepositoryDocument();
    final Date createdDate = page.getCreatedDate();
    final Date lastModified = page.getLastModifiedDate();
    final DateFormat df = DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.MEDIUM, Locale.ROOT);
    /*
     * Retain page in Manifold because it has not changed from last time This is needed to keep the identifier in Manifold data, because by default if a document is not retained nor
     * ingested, it will be deleted by the framework
     */
    // The version string is built as: formatted last-modified date, then the
    // packed page restrictions, then the packed parent restrictions. A change
    // in any of the three therefore triggers reindexing below.
    final StringBuilder versionBuilder = new StringBuilder();
    versionBuilder.append(df.format(lastModified));
    final List<String> pageRestrictions = new ArrayList<String>();
    if (activeSecurity) {
        // Collect this page's own read restrictions as "user-"/"group-" tokens.
        final List<Restrictions> restrictions = getPageReadRestrictions(page.getId());
        for (final Restrictions res : restrictions) {
            final ReadRestrictions rr = res.getReadRestrictions();
            rr.getUsers().forEach(user -> {
                pageRestrictions.add("user-" + user.getUserKey());
            });
            rr.getGroups().forEach(group -> {
                pageRestrictions.add("group-" + group.getName());
            });
        }
    }
    // Order the page restrictions alphabetically so the version will be always the same in case the same restrictions between two crawls are
    // not retrieved in the same order
    pageRestrictions.sort(String::compareToIgnoreCase);
    versionBuilder.append("+");
    packList(versionBuilder, pageRestrictions, '+');
    versionBuilder.append("+");
    packList(versionBuilder, parentRestrictions, '+');
    final String lastVersion = versionBuilder.toString();
    // Get and reference page direct childs if any
    // NOTE: children are referenced BEFORE the reindexing check so they are
    // discovered even when this page itself is retained unchanged.
    if (page.getType() == PageType.PAGE) {
        final List<Page> pageChilds = getPageChilds(page.getId());
        for (final Page childPage : pageChilds) {
            final JSONObject child = new JSONObject();
            child.put("id", childPage.getId());
            final List<String> childParentRestrictions = new ArrayList<>();
            // its child pages
            // If this page has its own restrictions they replace the inherited
            // ones for the children; otherwise the parent restrictions
            // propagate unchanged.
            if (activeSecurity) {
                if (pageRestrictions.isEmpty()) {
                    childParentRestrictions.addAll(parentRestrictions);
                } else {
                    childParentRestrictions.addAll(pageRestrictions);
                }
            }
            childParentRestrictions.sort(String::compareToIgnoreCase);
            // NOTE(review): "parentRestricions" is misspelled; the consumer
            // that parses CHILD_PREFIX references presumably reads the same
            // misspelled key — confirm before renaming either side.
            child.put("parentRestricions", childParentRestrictions);
            activities.addDocumentReference(CHILD_PREFIX + child.toJSONString());
        }
    }
    // Unchanged version string: keep the document without re-ingesting it.
    if (!activities.checkDocumentNeedsReindexing(manifoldDocumentIdentifier, lastVersion)) {
        return new ProcessResult(page.getLength(), "RETAINED", "");
    }
    // Framework-side filters: length, mime type, date and URL. Each rejection
    // records a noDocument so the identifier is not deleted by the framework.
    if (!activities.checkLengthIndexable(page.getLength())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_LENGTH;
        final String errorDesc = "Excluding document because of length (" + page.getLength() + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkMimeTypeIndexable(page.getMediaType())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_MIMETYPE;
        final String errorDesc = "Excluding document because of mime type (" + page.getMediaType() + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkDateIndexable(lastModified)) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_DATE;
        final String errorDesc = "Excluding document because of date (" + lastModified + ")";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    if (!activities.checkURLIndexable(page.getWebUrl())) {
        activities.noDocument(manifoldDocumentIdentifier, lastVersion);
        final String errorCode = IProcessActivity.EXCLUDED_URL;
        final String errorDesc = "Excluding document because of URL ('" + page.getWebUrl() + "')";
        return new ProcessResult(page.getLength(), errorCode, errorDesc);
    }
    /* Add repository document information */
    rd.setMimeType(page.getMediaType());
    if (createdDate != null) {
        rd.setCreatedDate(createdDate);
    }
    if (lastModified != null) {
        rd.setModifiedDate(lastModified);
    }
    rd.setIndexingDate(new Date());
    /* Adding Page Metadata */
    final Map<String, Object> pageMetadata = page.getMetadataAsMap();
    for (final Entry<String, Object> entry : pageMetadata.entrySet()) {
        if (entry.getValue() instanceof List) {
            final List<?> list = (List<?>) entry.getValue();
            rd.addField(entry.getKey(), list.toArray(new String[list.size()]));
        } else if (entry.getValue() != null) {
            final String key = entry.getKey();
            final String value = entry.getValue().toString();
            rd.addField(key, value);
            // The title is duplicated into "stream_name" for downstream consumers.
            if (key.toLowerCase(Locale.ROOT).contentEquals("title")) {
                rd.addField("stream_name", value);
            }
        }
    }
    rd.addField("source", "confluence");
    /* Adding extra properties */
    for (final Entry<String, String> entry : extraProperties.entrySet()) {
        rd.addField(entry.getKey(), entry.getValue());
    }
    final String documentURI = page.getWebUrl();
    /* Set repository document ACLs */
    // Share level carries the space token, parent level the inherited
    // restrictions, document level the page's own restrictions.
    if (activeSecurity) {
        rd.setSecurity(RepositoryDocument.SECURITY_TYPE_SHARE, new String[] { "space-" + page.getSpace() }, new String[] { defaultAuthorityDenyToken });
        if (parentRestrictions.size() > 0) {
            rd.setSecurity(RepositoryDocument.SECURITY_TYPE_PARENT, parentRestrictions.toArray(new String[0]), new String[] { defaultAuthorityDenyToken });
        }
        if (pageRestrictions.size() > 0) {
            rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, pageRestrictions.toArray(new String[0]), new String[] { defaultAuthorityDenyToken });
        }
    }
    rd.setBinary(page.getContentStream(), page.getLength());
    rd.addField("size", String.valueOf(page.getLength()));
    rd.addField("url", documentURI);
    /* Ingest document */
    activities.ingestDocumentWithException(manifoldDocumentIdentifier, lastVersion, documentURI, rd);
    return new ProcessResult(page.getLength(), null, null);
}
Use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.
Class ConfluenceClient, method getConfluenceResources.
/**
 * <p>
 * Get the {@code ConfluenceResources} from the given url
 * </p>
 *
 * @param url the url identifying the REST resource to get the documents
 * @param builder the builder used to build the resources contained in the response
 * @return a {@code ConfluenceResponse} containing the page results
 * @throws Exception if Confluence cannot be reached
 */
private ConfluenceResponse<? extends ConfluenceResource> getConfluenceResources(final String url, final ConfluenceResourceBuilder<? extends ConfluenceResource> builder) throws Exception {
    logger.debug("[Processing] Hitting url for get confluence resources: {}", sanitizeUrl(url));
    final HttpGet getRequest = createGetRequest(url);
    try (CloseableHttpResponse httpResponse = executeRequest(getRequest)) {
        // Parse the body via the supplied builder, then make sure the entity
        // is fully consumed so the underlying connection can be reused.
        final HttpEntity entity = httpResponse.getEntity();
        final ConfluenceResponse<? extends ConfluenceResource> parsed = responseFromHttpEntity(entity, builder);
        EntityUtils.consume(entity);
        return parsed;
    } catch (final IOException e) {
        logger.error("[Processing] Failed to get page(s)", e);
        throw new Exception("Confluence appears to be down", e);
    }
}
Use of org.apache.manifoldcf.crawler.connectors.confluence.v6.model.Page in project manifoldcf by apache.
Class ConfluenceClient, method getAttachment.
/**
 * <p>
 * Gets a specific attachment contained in the specific page
 * </p>
 *
 * @param attachmentId the identifier of the attachment to fetch
 * @return the {@code Attachment} instance, with its binary content already retrieved
 * @throws Exception if the attachment metadata or its content cannot be fetched
 */
public Attachment getAttachment(final String attachmentId) throws Exception {
    final String url = String.format(Locale.ROOT, "%s://%s:%s%s%s/%s?%s", protocol, host, port, path, CONTENT_PATH, attachmentId, EXPANDABLE_PARAMETERS);
    logger.debug("[Processing] Hitting url for getting document content : {}", sanitizeUrl(url));
    final HttpGet httpGet = createGetRequest(url);
    try (CloseableHttpResponse response = executeRequest(httpGet)) {
        final HttpEntity entity = response.getEntity();
        final MutableAttachment attachment = attachmentFromHttpEntity(entity);
        EntityUtils.consume(entity);
        // Download the attachment's binary content before returning it.
        retrieveAndSetAttachmentContent(attachment);
        return attachment;
    } catch (final Exception e) {
        // Pass the exception as the final SLF4J argument so the full stack
        // trace is logged instead of only e.getMessage().
        logger.error("[Processing] Failed to get attachment {}", url, e);
        throw e;
    }
}
Aggregations