Usage of org.apache.gobblin.source.extractor.watermark.Predicate in the Apache incubator-gobblin project.
Example from the QueryBasedExtractor class, method removeDataPullUpperBounds.
/**
 * Removes every high-watermark (HWM) predicate from {@code predicateList} so the
 * last work unit pulls data without an upper bound.
 */
private void removeDataPullUpperBounds() {
  log.info("Removing data pull upper bound for last work unit");
  // removeIf is the idiomatic Java 8 replacement for the explicit Iterator/remove loop
  predicateList.removeIf(predicate -> {
    if (predicate.getType() == Predicate.PredicateType.HWM) {
      log.info("Remove predicate: " + predicate.condition);
      return true;
    }
    return false;
  });
}
Usage of org.apache.gobblin.source.extractor.watermark.Predicate in the Apache incubator-gobblin project.
Example from the SalesforceExtractor class, method getQueryResultIds.
/**
 * Gets the batch id / result set id pairs for a record set using the salesforce Bulk API.
 * Creates an asynchronous bulk job for the given entity, submits the predicate-augmented
 * query as a batch, polls until the batch (or its PK-chunking sub-batches) completes, and
 * collects the result set ids of every batch.
 *
 * @param entity the salesforce object (table) name to query
 * @param predicateList all predicate conditions to apply to the query
 * @return list of batch id / result set id pairs covering the entire result set
 * @throws Exception if the bulk login fails or the bulk job cannot be completed
 */
private List<BatchIdAndResultId> getQueryResultIds(String entity, List<Predicate> predicateList) throws Exception {
  if (!bulkApiLogin()) {
    throw new IllegalArgumentException("Invalid Login");
  }
  try {
    boolean usingPkChunking = false;

    // Set bulk job attributes
    this.bulkJob.setObject(entity);
    this.bulkJob.setOperation(OperationEnum.query);
    this.bulkJob.setConcurrencyMode(ConcurrencyMode.Parallel);

    // use pk chunking if pk chunking is configured and the expected record count is larger than the pk chunking size
    if (this.pkChunking && getExpectedRecordCount() > this.pkChunkingSize) {
      log.info("Enabling pk chunking with size {}", this.pkChunkingSize);
      this.bulkConnection.addHeader("Sforce-Enable-PKChunking", "chunkSize=" + this.pkChunkingSize);
      usingPkChunking = true;
    }

    // Result type as CSV
    this.bulkJob.setContentType(ContentType.CSV);

    this.bulkJob = this.bulkConnection.createJob(this.bulkJob);
    this.bulkJob = this.bulkConnection.getJobStatus(this.bulkJob.getId());

    // Construct query with the predicates
    String query = this.updatedQuery;
    if (!isNullPredicate(predicateList)) {
      // Strip the LIMIT clause so predicates can be appended, then re-add it at the end
      String limitString = getLimitFromInputQuery(query);
      query = query.replace(limitString, "");
      for (Predicate predicate : predicateList) {
        query = SqlQueryUtils.addPredicate(query, predicate.getCondition());
      }
      query = query + limitString;
    }
    log.info("QUERY:" + query);
    ByteArrayInputStream bout = new ByteArrayInputStream(query.getBytes(ConfigurationKeys.DEFAULT_CHARSET_ENCODING));

    BatchInfo bulkBatchInfo = this.bulkConnection.createBatchFromStream(this.bulkJob, bout);

    // Scale the polling interval with the expected batch size, capped at MAX_RETRY_INTERVAL_SECS
    long expectedSizePerBatch = usingPkChunking ? this.pkChunkingSize : this.getExpectedRecordCount();
    int retryInterval = Math.min(MAX_RETRY_INTERVAL_SECS, 30 + (int) Math.ceil((float) expectedSizePerBatch / 10000) * 2);
    log.info("Salesforce bulk api retry interval in seconds:" + retryInterval);

    // Get batch info with complete resultset (info id - refers to the resultset id corresponding to entire resultset)
    bulkBatchInfo = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bulkBatchInfo.getId());

    // wait for completion, failure, or formation of PK chunking batches
    while ((bulkBatchInfo.getState() != BatchStateEnum.Completed) && (bulkBatchInfo.getState() != BatchStateEnum.Failed) && (!usingPkChunking || bulkBatchInfo.getState() != BatchStateEnum.NotProcessed)) {
      // long multiplier avoids any risk of int overflow in the sleep duration
      Thread.sleep(retryInterval * 1000L);
      bulkBatchInfo = this.bulkConnection.getBatchInfo(this.bulkJob.getId(), bulkBatchInfo.getId());
      log.debug("Bulk Api Batch Info:" + bulkBatchInfo);
      log.info("Waiting for bulk resultSetIds");
    }

    // Wait for pk chunking batches
    BatchInfoList batchInfoList = this.bulkConnection.getBatchInfoList(this.bulkJob.getId());
    if (usingPkChunking && bulkBatchInfo.getState() == BatchStateEnum.NotProcessed) {
      bulkBatchInfo = waitForPkBatches(batchInfoList, retryInterval);
    }

    if (bulkBatchInfo.getState() == BatchStateEnum.Failed) {
      log.error("Bulk batch failed: " + bulkBatchInfo.toString());
      throw new RuntimeException("Failed to get bulk batch info for jobId " + bulkBatchInfo.getJobId() + " error - " + bulkBatchInfo.getStateMessage());
    }

    // Get resultset ids of all the batches from the batch info list
    List<BatchIdAndResultId> batchIdAndResultIdList = Lists.newArrayList();
    for (BatchInfo bi : batchInfoList.getBatchInfo()) {
      QueryResultList list = this.bulkConnection.getQueryResultList(this.bulkJob.getId(), bi.getId());
      for (String result : list.getResult()) {
        batchIdAndResultIdList.add(new BatchIdAndResultId(bi.getId(), result));
      }
    }
    log.info("QueryResultList: " + batchIdAndResultIdList);
    return batchIdAndResultIdList;
  } catch (InterruptedException e) {
    // Restore the interrupt status before rethrowing so callers can still observe the interruption
    Thread.currentThread().interrupt();
    throw new RuntimeException("Failed to get query result ids from salesforce using bulk api; error - " + e.getMessage(), e);
  } catch (RuntimeException | AsyncApiException e) {
    throw new RuntimeException("Failed to get query result ids from salesforce using bulk api; error - " + e.getMessage(), e);
  }
}
Usage of org.apache.gobblin.source.extractor.watermark.Predicate in the Apache incubator-gobblin project.
Example from the SalesforceExtractor class, method getCountMetadata.
/**
 * Builds the url command used to fetch the source record count from salesforce.
 * The count query reuses any WHERE clause already present in the configured query
 * and appends the supplied predicate conditions.
 *
 * @param schema source schema name
 * @param entity the salesforce object (table) name
 * @param workUnit the work unit being processed
 * @param predicateList predicate conditions to append to the count query
 * @return commands that execute the count query
 * @throws RecordCountException if the count url cannot be constructed
 */
@Override
public List<Command> getCountMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws RecordCountException {
  log.debug("Build url to retrieve source record count");

  // Carry over any WHERE clause from the configured query
  String existingPredicate = "";
  if (this.updatedQuery != null) {
    int whereIdx = this.updatedQuery.toLowerCase().indexOf(" where ");
    if (whereIdx > 0) {
      existingPredicate = this.updatedQuery.substring(whereIdx);
    }
  }

  String countQuery = "SELECT COUNT() FROM " + entity + existingPredicate;
  // Strip any LIMIT clause before appending predicate conditions
  countQuery = countQuery.replace(getLimitFromInputQuery(countQuery), "");

  try {
    if (isNullPredicate(predicateList)) {
      log.info("QUERY with null predicate: " + countQuery);
      return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(countQuery)));
    }
    for (Predicate condition : predicateList) {
      countQuery = SqlQueryUtils.addPredicate(countQuery, condition.getCondition());
    }
    // Re-apply the LIMIT clause of the configured query, if any
    countQuery = countQuery + getLimitFromInputQuery(this.updatedQuery);
    log.info("QUERY: " + countQuery);
    return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(countQuery)));
  } catch (Exception e) {
    throw new RecordCountException("Failed to get salesforce url for record count; error - " + e.getMessage(), e);
  }
}
Usage of org.apache.gobblin.source.extractor.watermark.Predicate in the Apache incubator-gobblin project.
Example from the SalesforceExtractor class, method getDataMetadata.
/**
 * Builds the url command used to retrieve data records from salesforce.
 * If a pull is already in progress, continues from the stored pagination url;
 * otherwise constructs a new query url from the configured query plus the
 * supplied predicate conditions.
 *
 * @param schema source schema name
 * @param entity the salesforce object (table) name
 * @param workUnit the work unit being processed
 * @param predicateList predicate conditions to append to the data query
 * @return commands that fetch the data records
 * @throws DataRecordException if the data url cannot be constructed
 */
@Override
public List<Command> getDataMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList) throws DataRecordException {
  log.debug("Build url to retrieve data records");
  String query = this.updatedQuery;
  String url = null;
  try {
    // pullStatus compared directly; "== true" on a boolean is redundant
    if (this.getNextUrl() != null && this.pullStatus) {
      // A pull is already in progress; continue from the pagination url
      url = this.getNextUrl();
    } else {
      if (isNullPredicate(predicateList)) {
        log.info("QUERY:" + query);
        return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(query)));
      }
      // Strip the LIMIT clause so predicates can be appended, then re-add it at the end
      String limitString = getLimitFromInputQuery(query);
      query = query.replace(limitString, "");
      for (Predicate predicate : predicateList) {
        query = SqlQueryUtils.addPredicate(query, predicate.getCondition());
      }
      // parseBoolean avoids the boxing of Boolean.valueOf and is null-safe (null -> false)
      if (Boolean.parseBoolean(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE))) {
        // Include deleted records when the specific api is active
        query = SqlQueryUtils.addPredicate(query, "IsDeleted = true");
      }
      query = query + limitString;
      log.info("QUERY: " + query);
      url = this.sfConnector.getFullUri(getSoqlUrl(query));
    }
    return constructGetCommand(url);
  } catch (Exception e) {
    throw new DataRecordException("Failed to get salesforce url for data records; error - " + e.getMessage(), e);
  }
}
Usage of org.apache.gobblin.source.extractor.watermark.Predicate in the Apache incubator-gobblin project.
Example from the SalesforceExtractor class, method getHighWatermarkMetadata.
/**
 * Builds the url command used to fetch the high watermark value from salesforce.
 * Selects the maximum value of the watermark column that satisfies the existing
 * WHERE clause (if any), the supplied predicates, and a non-null watermark condition.
 *
 * @param schema source schema name
 * @param entity the salesforce object (table) name
 * @param watermarkColumn the column holding the watermark value
 * @param predicateList predicate conditions to append to the watermark query
 * @return commands that execute the high watermark query
 * @throws HighWatermarkException if the watermark url cannot be constructed
 */
@Override
public List<Command> getHighWatermarkMetadata(String schema, String entity, String watermarkColumn, List<Predicate> predicateList) throws HighWatermarkException {
  log.debug("Build url to retrieve high watermark");

  String notNullCondition = " " + watermarkColumn + " != null";
  String sortAndLimit = " ORDER BY " + watermarkColumn + " desc LIMIT 1";

  // Carry over any WHERE clause from the configured query
  String inheritedPredicate = "";
  if (this.updatedQuery != null) {
    int whereIdx = this.updatedQuery.toLowerCase().indexOf(" where ");
    if (whereIdx > 0) {
      inheritedPredicate = this.updatedQuery.substring(whereIdx);
    }
  }

  String watermarkQuery = "SELECT " + watermarkColumn + " FROM " + entity + inheritedPredicate;
  // Strip any LIMIT clause before appending predicate conditions
  watermarkQuery = watermarkQuery.replace(getLimitFromInputQuery(watermarkQuery), "");

  for (Predicate condition : predicateList) {
    watermarkQuery = SqlQueryUtils.addPredicate(watermarkQuery, condition.getCondition());
  }
  watermarkQuery = SqlQueryUtils.addPredicate(watermarkQuery, notNullCondition);
  watermarkQuery = watermarkQuery + sortAndLimit;
  log.info("QUERY: " + watermarkQuery);

  try {
    return constructGetCommand(this.sfConnector.getFullUri(getSoqlUrl(watermarkQuery)));
  } catch (Exception e) {
    throw new HighWatermarkException("Failed to get salesforce url for high watermark; error - " + e.getMessage(), e);
  }
}
Aggregations