Search in sources :

Example 1 with Predicate

use of org.apache.gobblin.source.extractor.watermark.Predicate in project incubator-gobblin by apache.

the class QueryBasedExtractor method removeDataPullUpperBounds.

/**
 * Strips the data-pull upper bounds from {@code predicateList}: every predicate whose type is
 * {@code Predicate.PredicateType.HWM} is removed in place, and each removal is logged.
 */
private void removeDataPullUpperBounds() {
    log.info("Removing data pull upper bound for last work unit");
    for (Iterator<Predicate> cursor = predicateList.iterator(); cursor.hasNext();) {
        Predicate candidate = cursor.next();
        // Anything that is not a high-watermark predicate stays in the list.
        if (candidate.getType() != Predicate.PredicateType.HWM) {
            continue;
        }
        log.info("Remove predicate: " + candidate.condition);
        // Remove through the iterator so the list is not modified behind its back mid-iteration.
        cursor.remove();
    }
}
Also used : WatermarkPredicate(org.apache.gobblin.source.extractor.watermark.WatermarkPredicate) Predicate(org.apache.gobblin.source.extractor.watermark.Predicate)

Example 2 with Predicate

use of org.apache.gobblin.source.extractor.watermark.Predicate in project incubator-gobblin by apache.

the class SalesforceExtractor method getQueryResultIds.

/**
 * Submits the query for {@code entity} as a Salesforce Bulk API job and collects the result ids
 * of the job's batches.
 *
 * <p>The query is {@code this.updatedQuery} with each predicate condition added to it; the limit
 * string obtained from {@code getLimitFromInputQuery} is taken off before the predicates are
 * added and appended again afterwards. PK chunking is requested when it is configured and the
 * expected record count exceeds the configured chunk size. The method then polls the batch
 * status until it is completed or failed (or, with PK chunking, reported as not processed).
 *
 * @param entity object (table) name set on the bulk job
 * @param predicateList predicate conditions to add to the query; ignored when
 *        {@code isNullPredicate(predicateList)} is true
 * @return batch id / result id pairs for every batch in the job's batch info list
 * @throws IllegalArgumentException if {@code bulkApiLogin()} reports failure
 * @throws RuntimeException if the batch ends in the failed state, a bulk API call fails, or the
 *         waiting thread is interrupted (the interrupt flag is restored before throwing)
 */
private List<BatchIdAndResultId> getQueryResultIds(String entity, List<Predicate> predicateList) throws Exception {
    if (!bulkApiLogin()) {
        throw new IllegalArgumentException("Invalid Login");
    }
    try {
        boolean usingPkChunking = false;
        // Set bulk job attributes
        this.bulkJob.setObject(entity);
        this.bulkJob.setOperation(OperationEnum.query);
        this.bulkJob.setConcurrencyMode(ConcurrencyMode.Parallel);
        // use pk chunking if pk chunking is configured and the expected record count is larger than the pk chunking size
        if (this.pkChunking && getExpectedRecordCount() > this.pkChunkingSize) {
            log.info("Enabling pk chunking with size {}", this.pkChunkingSize);
            this.bulkConnection.addHeader("Sforce-Enable-PKChunking", "chunkSize=" + this.pkChunkingSize);
            usingPkChunking = true;
        }
        // Result type as CSV
        this.bulkJob.setContentType(ContentType.CSV);
        this.bulkJob = this.bulkConnection.createJob(this.bulkJob);
        this.bulkJob = this.bulkConnection.getJobStatus(this.bulkJob.getId());
        // Construct query with the predicates
        String query = this.updatedQuery;
        if (!isNullPredicate(predicateList)) {
            // Take the limit string off first so the predicates are added ahead of it, then put it back at the end.
            String limitString = getLimitFromInputQuery(query);
            query = query.replace(limitString, "");
            for (Predicate predicate : predicateList) {
                query = SqlQueryUtils.addPredicate(query, predicate.getCondition());
            }
            query = query + limitString;
        }
        log.info("QUERY:" + query);
        ByteArrayInputStream bout = new ByteArrayInputStream(query.getBytes(ConfigurationKeys.DEFAULT_CHARSET_ENCODING));
        BatchInfo bulkBatchInfo = this.bulkConnection.createBatchFromStream(this.bulkJob, bout);
        // Polling interval grows with the expected batch size: 30s plus 2s per 10000 records, capped at MAX_RETRY_INTERVAL_SECS.
        long expectedSizePerBatch = usingPkChunking ? this.pkChunkingSize : this.getExpectedRecordCount();
        int retryInterval = Math.min(MAX_RETRY_INTERVAL_SECS, 30 + (int) Math.ceil((float) expectedSizePerBatch / 10000) * 2);
        log.info("Salesforce bulk api retry interval in seconds:" + retryInterval);
        // Get batch info with complete resultset (info id - refers to the resultset id corresponding to entire resultset)
        bulkBatchInfo = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bulkBatchInfo.getId());
        // wait for completion, failure, or formation of PK chunking batches
        while ((bulkBatchInfo.getState() != BatchStateEnum.Completed) && (bulkBatchInfo.getState() != BatchStateEnum.Failed) && (!usingPkChunking || bulkBatchInfo.getState() != BatchStateEnum.NotProcessed)) {
            // Multiply as long so the millisecond value cannot overflow int.
            Thread.sleep(retryInterval * 1000L);
            bulkBatchInfo = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bulkBatchInfo.getId());
            log.debug("Bulk Api Batch Info:" + bulkBatchInfo);
            log.info("Waiting for bulk resultSetIds");
        }
        // Wait for pk chunking batches
        BatchInfoList batchInfoList = this.bulkConnection.getBatchInfoList(this.bulkJob.getId());
        if (usingPkChunking && bulkBatchInfo.getState() == BatchStateEnum.NotProcessed) {
            bulkBatchInfo = waitForPkBatches(batchInfoList, retryInterval);
        }
        if (bulkBatchInfo.getState() == BatchStateEnum.Failed) {
            log.error("Bulk batch failed: " + bulkBatchInfo.toString());
            throw new RuntimeException("Failed to get bulk batch info for jobId " + bulkBatchInfo.getJobId() + " error - " + bulkBatchInfo.getStateMessage());
        }
        // Get resultset ids of all the batches from the batch info list
        List<BatchIdAndResultId> batchIdAndResultIdList = Lists.newArrayList();
        for (BatchInfo bi : batchInfoList.getBatchInfo()) {
            QueryResultList list = this.bulkConnection.getQueryResultList(this.bulkJob.getId(), bi.getId());
            for (String result : list.getResult()) {
                batchIdAndResultIdList.add(new BatchIdAndResultId(bi.getId(), result));
            }
        }
        log.info("QueryResultList: " + batchIdAndResultIdList);
        return batchIdAndResultIdList;
    } catch (RuntimeException | AsyncApiException | InterruptedException e) {
        if (e instanceof InterruptedException) {
            // Wrapping the exception would otherwise hide the interruption from code further up the stack.
            Thread.currentThread().interrupt();
        }
        throw new RuntimeException("Failed to get query result ids from salesforce using bulk api; error - " + e.getMessage(), e);
    }
}
Also used : QueryResultList(com.sforce.async.QueryResultList) AsyncApiException(com.sforce.async.AsyncApiException) Predicate(org.apache.gobblin.source.extractor.watermark.Predicate) ByteArrayInputStream(java.io.ByteArrayInputStream) BatchInfoList(com.sforce.async.BatchInfoList) BatchInfo(com.sforce.async.BatchInfo)

Example 3 with Predicate

use of org.apache.gobblin.source.extractor.watermark.Predicate in project incubator-gobblin by apache.

the class SalesforceExtractor method getCountMetadata.

/**
 * Builds the command that fetches the source record count for {@code entity}.
 *
 * <p>The count query is {@code SELECT COUNT() FROM <entity>} followed by whatever follows the
 * first {@code " where "} (case-insensitive) in {@code this.updatedQuery}, with the limit string
 * reported by {@code getLimitFromInputQuery} removed. When predicates are present, each
 * condition is added and the limit string of {@code this.updatedQuery} is appended.
 *
 * @param schema not used by this method
 * @param entity object (table) name to count
 * @param workUnit not used by this method
 * @param predicateList predicate conditions to add to the count query
 * @return the GET command(s) produced by {@code constructGetCommand} for the count URL
 * @throws RecordCountException if building the URL or the command fails
 */
@Override
public List<Command> getCountMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws RecordCountException {
    log.debug("Build url to retrieve source record count");
    // Carry over the WHERE clause (and anything after it) from the configured query, if there is one.
    String whereClause = "";
    if (this.updatedQuery != null) {
        int whereAt = this.updatedQuery.toLowerCase().indexOf(" where ");
        if (whereAt > 0) {
            whereClause = this.updatedQuery.substring(whereAt);
        }
    }
    String countQuery = "SELECT COUNT() FROM " + entity + whereClause;
    String limitClause = getLimitFromInputQuery(countQuery);
    countQuery = countQuery.replace(limitClause, "");
    try {
        if (!isNullPredicate(predicateList)) {
            for (Predicate predicate : predicateList) {
                countQuery = SqlQueryUtils.addPredicate(countQuery, predicate.getCondition());
            }
            // The limit appended here is taken from the configured query, not from the count query built above.
            countQuery = countQuery + getLimitFromInputQuery(this.updatedQuery);
            log.info("QUERY: " + countQuery);
        } else {
            log.info("QUERY with null predicate: " + countQuery);
        }
        return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(countQuery)));
    } catch (Exception e) {
        throw new RecordCountException("Failed to get salesforce url for record count; error - " + e.getMessage(), e);
    }
}
Also used : RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) ParseException(java.text.ParseException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) AsyncApiException(com.sforce.async.AsyncApiException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) RestApiClientException(org.apache.gobblin.source.extractor.exception.RestApiClientException) IOException(java.io.IOException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) RestApiConnectionException(org.apache.gobblin.source.extractor.exception.RestApiConnectionException) Predicate(org.apache.gobblin.source.extractor.watermark.Predicate)

Example 4 with Predicate

use of org.apache.gobblin.source.extractor.watermark.Predicate in project incubator-gobblin by apache.

the class SalesforceExtractor method getDataMetadata.

/**
 * Builds the command that fetches data records.
 *
 * <p>If a next URL is available and {@code pullStatus} is set, that URL is used as is. Otherwise
 * the URL is built from {@code this.updatedQuery}: unchanged when there are no predicates, or
 * with every predicate condition added (plus {@code IsDeleted = true} when the
 * {@code SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE} property is true) and the limit string moved
 * back to the end.
 *
 * @param schema not used by this method
 * @param entity not used by this method
 * @param workUnit not used by this method
 * @param predicateList predicate conditions to add to the query
 * @return the GET command(s) produced by {@code constructGetCommand} for the data URL
 * @throws DataRecordException if building the URL or the command fails
 */
@Override
public List<Command> getDataMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws DataRecordException {
    log.debug("Build url to retrieve data records");
    String soql = this.updatedQuery;
    try {
        // A pending next URL takes precedence over building a fresh query.
        if (this.getNextUrl() != null && this.pullStatus) {
            return constructGetCommand(this.getNextUrl());
        }
        if (isNullPredicate(predicateList)) {
            log.info("QUERY:" + soql);
            return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(soql)));
        }
        // Detach the limit string so the predicates land before it, then re-attach it last.
        String limitClause = getLimitFromInputQuery(soql);
        soql = soql.replace(limitClause, "");
        for (Predicate predicate : predicateList) {
            soql = SqlQueryUtils.addPredicate(soql, predicate.getCondition());
        }
        boolean specificApiActive = Boolean.valueOf(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE));
        if (specificApiActive) {
            soql = SqlQueryUtils.addPredicate(soql, "IsDeleted = true");
        }
        soql = soql + limitClause;
        log.info("QUERY: " + soql);
        String dataUrl = this.sfConnector.getFullUri(getSoqlUrl(soql));
        return constructGetCommand(dataUrl);
    } catch (Exception e) {
        throw new DataRecordException("Failed to get salesforce url for data records; error - " + e.getMessage(), e);
    }
}
Also used : ParseException(java.text.ParseException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) AsyncApiException(com.sforce.async.AsyncApiException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) RestApiClientException(org.apache.gobblin.source.extractor.exception.RestApiClientException) IOException(java.io.IOException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) RestApiConnectionException(org.apache.gobblin.source.extractor.exception.RestApiConnectionException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) Predicate(org.apache.gobblin.source.extractor.watermark.Predicate)

Example 5 with Predicate

use of org.apache.gobblin.source.extractor.watermark.Predicate in project incubator-gobblin by apache.

the class SalesforceExtractor method getHighWatermarkMetadata.

/**
 * Builds the command that fetches the high watermark for {@code entity}.
 *
 * <p>The query selects {@code watermarkColumn} from {@code entity}, carries over whatever follows
 * the first {@code " where "} (case-insensitive) in {@code this.updatedQuery} with its limit
 * string removed, adds every predicate condition plus a {@code <watermarkColumn> != null}
 * condition, and ends with {@code ORDER BY <watermarkColumn> desc LIMIT 1}.
 *
 * @param schema not used by this method
 * @param entity object (table) name to query
 * @param watermarkColumn column whose highest non-null value is requested
 * @param predicateList predicate conditions to add to the query; a {@code null} list is treated
 *        as having no predicates
 * @return the GET command(s) produced by {@code constructGetCommand} for the watermark URL
 * @throws HighWatermarkException if building the URL or the command fails
 */
@Override
public List<Command> getHighWatermarkMetadata(String schema, String entity, String watermarkColumn, List<Predicate> predicateList) throws HighWatermarkException {
    log.debug("Build url to retrieve high watermark");
    String query = "SELECT " + watermarkColumn + " FROM " + entity;
    String defaultPredicate = " " + watermarkColumn + " != null";
    String defaultSortOrder = " ORDER BY " + watermarkColumn + " desc LIMIT 1";
    String existingPredicate = "";
    if (this.updatedQuery != null) {
        String queryLowerCase = this.updatedQuery.toLowerCase();
        int startIndex = queryLowerCase.indexOf(" where ");
        if (startIndex > 0) {
            existingPredicate = this.updatedQuery.substring(startIndex);
        }
    }
    query = query + existingPredicate;
    // Drop any limit string carried over from the configured query; this query sets its own LIMIT 1 below.
    String limitString = getLimitFromInputQuery(query);
    query = query.replace(limitString, "");
    // Guard against a null list so it is handled as "no predicates" rather than a NullPointerException.
    if (predicateList != null) {
        for (Predicate predicate : predicateList) {
            query = SqlQueryUtils.addPredicate(query, predicate.getCondition());
        }
    }
    query = SqlQueryUtils.addPredicate(query, defaultPredicate);
    query = query + defaultSortOrder;
    log.info("QUERY: " + query);
    try {
        return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(query)));
    } catch (Exception e) {
        throw new HighWatermarkException("Failed to get salesforce url for high watermark; error - " + e.getMessage(), e);
    }
}
Also used : HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) ParseException(java.text.ParseException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) AsyncApiException(com.sforce.async.AsyncApiException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) RestApiClientException(org.apache.gobblin.source.extractor.exception.RestApiClientException) IOException(java.io.IOException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) RestApiConnectionException(org.apache.gobblin.source.extractor.exception.RestApiConnectionException) Predicate(org.apache.gobblin.source.extractor.watermark.Predicate)

Aggregations

Predicate (org.apache.gobblin.source.extractor.watermark.Predicate): 6; AsyncApiException (com.sforce.async.AsyncApiException): 4; IOException (java.io.IOException): 3; ParseException (java.text.ParseException): 3; DataRecordException (org.apache.gobblin.source.extractor.DataRecordException): 3; HighWatermarkException (org.apache.gobblin.source.extractor.exception.HighWatermarkException): 3; RecordCountException (org.apache.gobblin.source.extractor.exception.RecordCountException): 3; RestApiClientException (org.apache.gobblin.source.extractor.exception.RestApiClientException): 3; RestApiConnectionException (org.apache.gobblin.source.extractor.exception.RestApiConnectionException): 3; SchemaException (org.apache.gobblin.source.extractor.exception.SchemaException): 3; WatermarkPredicate (org.apache.gobblin.source.extractor.watermark.WatermarkPredicate): 2; BatchInfo (com.sforce.async.BatchInfo): 1; BatchInfoList (com.sforce.async.BatchInfoList): 1; QueryResultList (com.sforce.async.QueryResultList): 1; ByteArrayInputStream (java.io.ByteArrayInputStream): 1; ArrayList (java.util.ArrayList): 1