Search in sources :

Example 6 with WatermarkType

use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.

the class Partitioner method getGlobalPartition.

/**
 * Get the global partition of the whole data set, which has the global low and high watermarks.
 *
 * <p>The extract type is read from {@code ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE} and the
 * watermark type from {@code ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE} (falling back to
 * {@code ConfigurationKeys.DEFAULT_WATERMARK_TYPE}). Both values are upper-cased with
 * {@code Locale.ROOT} before being resolved to their enum constants, so the lookup does not depend
 * on the JVM default locale.
 *
 * @param previousWatermark previous watermark for computing the low watermark of current run
 * @return a Partition instance
 * @throws IllegalArgumentException if a configured type name does not match any enum constant
 */
public Partition getGlobalPartition(long previousWatermark) {
    // NOTE(review): no default is supplied for the extract type; presumably getProp returns null
    // when the key is absent, which would surface here as a NullPointerException - confirm.
    String extractTypeName = state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE);
    // Locale.ROOT keeps the enum lookup stable under locales with special casing rules
    // (e.g. Turkish dotted/dotless i).
    ExtractType extractType = ExtractType.valueOf(extractTypeName.toUpperCase(java.util.Locale.ROOT));

    String watermarkTypeName = state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE);
    WatermarkType watermarkType = WatermarkType.valueOf(watermarkTypeName.toUpperCase(java.util.Locale.ROOT));

    // The predicate is built without a watermark column; only its delta for the next watermark is used.
    WatermarkPredicate watermark = new WatermarkPredicate(null, watermarkType);
    int deltaForNextWatermark = watermark.getDeltaNumForNextWatermark();

    long lowWatermark = getLowWatermark(extractType, watermarkType, previousWatermark, deltaForNextWatermark);
    long highWatermark = getHighWatermark(extractType, watermarkType);
    return new Partition(lowWatermark, highWatermark, true, hasUserSpecifiedHighWatermark);
}
Also used : WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) WatermarkPredicate(org.apache.gobblin.source.extractor.watermark.WatermarkPredicate) ExtractType(org.apache.gobblin.source.extractor.extract.ExtractType)

Example 7 with WatermarkType

use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.

the class QueryBasedExtractor method build.

/**
 * Build schema, record count and high water mark.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Resolve the watermark type from {@code ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE};
 *       a blank value yields {@code null}. The name is upper-cased with {@code Locale.ROOT} so the
 *       enum lookup does not depend on the JVM default locale.</li>
 *   <li>Apply the connection timeout and extract the schema metadata.</li>
 *   <li>If a watermark column ({@code ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY}) is configured,
 *       determine the high watermark for this run and set the range predicates. For the last
 *       partition the high watermark is fetched from the source; otherwise the partition's own
 *       high watermark is used.</li>
 *   <li>Compute the source record count, unless
 *       {@code ConfigurationKeys.SOURCE_QUERYBASED_SKIP_COUNT_CALC} is true, in which case the count
 *       is set to -1. A count of 0 sets the fetch status to false.</li>
 * </ol>
 *
 * @return this extractor
 * @throws ExtractPrepareException if the timeout, schema, high watermark or record count step fails;
 *         the original exception is attached as the cause
 * @throws IllegalArgumentException if the configured watermark type does not match any
 *         {@code WatermarkType} constant (raised before the guarded section)
 */
public Extractor<S, D> build() throws ExtractPrepareException {
    String watermarkColumn = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
    long lwm = partition.getLowWatermark();
    long hwm = partition.getHighWatermark();
    log.info("Low water mark: " + lwm + "; and High water mark: " + hwm);

    // Read the property once; a blank value means no watermark type is configured.
    String watermarkTypeName = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE);
    WatermarkType watermarkType;
    if (StringUtils.isBlank(watermarkTypeName)) {
        watermarkType = null;
    } else {
        // Locale.ROOT keeps the enum lookup stable under locales with special casing rules
        // (e.g. Turkish dotted/dotless i).
        watermarkType = WatermarkType.valueOf(watermarkTypeName.toUpperCase(java.util.Locale.ROOT));
    }
    log.info("Source Entity is " + this.entity);
    try {
        this.setTimeOut(this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_TIMEOUT, ConfigurationKeys.DEFAULT_CONN_TIMEOUT));
        this.extractMetadata(this.schema, this.entity, this.workUnit);
        if (StringUtils.isNotBlank(watermarkColumn)) {
            if (partition.isLastPartition()) {
                // Get a more accurate high watermark from the source
                long adjustedHighWatermark = this.getLatestWatermark(watermarkColumn, watermarkType, lwm, hwm);
                log.info("High water mark from source: " + adjustedHighWatermark);
                // If the source returned the default watermark value, use the low watermark
                // (with no delta) as the high watermark, i.e. don't move the pointer.
                if (adjustedHighWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
                    adjustedHighWatermark = getLowWatermarkWithNoDelta(lwm);
                }
                this.highWatermark = adjustedHighWatermark;
            } else {
                this.highWatermark = hwm;
            }
            log.info("High water mark for the current run: " + highWatermark);
            this.setRangePredicates(watermarkColumn, watermarkType, lwm, highWatermark);
        }
        // If skip-count is set to true, skip count calculation and set source count to -1.
        // parseBoolean treats a missing (null) property as false, same as Boolean.valueOf.
        boolean skipCount = Boolean.parseBoolean(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SKIP_COUNT_CALC));
        if (!skipCount) {
            this.sourceRecordCount = this.getSourceCount(this.schema, this.entity, this.workUnit, this.predicateList);
        } else {
            log.info("Skip count calculation");
            this.sourceRecordCount = -1;
        }
        if (this.sourceRecordCount == 0) {
            log.info("Record count is 0; Setting fetch status to false to skip readRecord()");
            this.setFetchStatus(false);
        }
    } catch (SchemaException e) {
        throw new ExtractPrepareException("Failed to get schema for this object; error - " + e.getMessage(), e);
    } catch (HighWatermarkException e) {
        throw new ExtractPrepareException("Failed to get high watermark; error - " + e.getMessage(), e);
    } catch (RecordCountException e) {
        throw new ExtractPrepareException("Failed to get record count; error - " + e.getMessage(), e);
    } catch (Exception e) {
        // Boundary catch: any other failure during preparation is reported uniformly to the caller.
        throw new ExtractPrepareException("Failed to prepare the extract build; error - " + e.getMessage(), e);
    }
    return this;
}
Also used : SchemaException(org.apache.gobblin.source.extractor.exception.SchemaException) RecordCountException(org.apache.gobblin.source.extractor.exception.RecordCountException) WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) ExtractPrepareException(org.apache.gobblin.source.extractor.exception.ExtractPrepareException) HighWatermarkException(org.apache.gobblin.source.extractor.exception.HighWatermarkException) IOException(java.io.IOException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException)

Aggregations

WatermarkType (org.apache.gobblin.source.extractor.watermark.WatermarkType): 7, ExtractType (org.apache.gobblin.source.extractor.extract.ExtractType): 3, WatermarkPredicate (org.apache.gobblin.source.extractor.watermark.WatermarkPredicate): 3, JsonObject (com.google.gson.JsonObject): 1, IOException (java.io.IOException): 1, ArrayList (java.util.ArrayList): 1, Date (java.util.Date): 1, DataRecordException (org.apache.gobblin.source.extractor.DataRecordException): 1, ExtractPrepareException (org.apache.gobblin.source.extractor.exception.ExtractPrepareException): 1, HighWatermarkException (org.apache.gobblin.source.extractor.exception.HighWatermarkException): 1, RecordCountException (org.apache.gobblin.source.extractor.exception.RecordCountException): 1, SchemaException (org.apache.gobblin.source.extractor.exception.SchemaException): 1, Partition (org.apache.gobblin.source.extractor.partition.Partition): 1, Partitioner (org.apache.gobblin.source.extractor.partition.Partitioner): 1, Schema (org.apache.gobblin.source.extractor.schema.Schema): 1