Search in sources :

Example 1 with WatermarkType

use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.

the class QueryBasedExtractor method getLowWatermarkWithNoDelta.

/**
 * Moves a low watermark back by the delta that {@link WatermarkPredicate} reports for the
 * configured watermark type.
 *
 * <p>The default watermark value is returned untouched. For {@code SIMPLE} watermarks the delta is
 * subtracted numerically; for every other type the watermark is read as a {@code yyyyMMddHHmmss}
 * timestamp and shifted back by the delta, expressed in seconds.
 *
 * @param lwm low watermark to adjust
 * @return the adjusted low watermark, or the default watermark value when {@code lwm} is the default
 */
private long getLowWatermarkWithNoDelta(long lwm) {
    if (lwm == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
        return ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
    }
    String watermarkType = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "TIMESTAMP");
    // Locale.ROOT keeps the enum lookup independent of the JVM default locale
    // (e.g. under a Turkish locale "timestamp" would otherwise upper-case to "TİMESTAMP").
    WatermarkType wmType = WatermarkType.valueOf(watermarkType.toUpperCase(java.util.Locale.ROOT));
    int deltaNum = new WatermarkPredicate(wmType).getDeltaNumForNextWatermark();
    if (wmType == WatermarkType.SIMPLE) {
        return lwm - deltaNum;
    }
    // Every non-SIMPLE type is treated as a timestamp-formatted watermark.
    Date lowWaterMarkDate = Utils.toDate(lwm, "yyyyMMddHHmmss");
    return Long.parseLong(Utils.dateToString(Utils.addSecondsToDate(lowWaterMarkDate, -deltaNum), "yyyyMMddHHmmss"));
}
Also used : WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) WatermarkPredicate(org.apache.gobblin.source.extractor.watermark.WatermarkPredicate) Date(java.util.Date)

Example 2 with WatermarkType

use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.

the class JdbcExtractor method getDefaultWatermark.

/**
 * Builds the schema of the default watermark column, which is required if there are multiple
 * watermarks.
 *
 * <p>The column is named {@code derivedwatermarkcolumn}. Its data type follows the configured
 * watermark type: {@code timestamp} for TIMESTAMP, {@code date} for DATE and {@code int} for
 * anything else.
 *
 * @return column schema as a JSON object
 */
private JsonObject getDefaultWatermark() {
    Schema schema = new Schema();
    String columnName = "derivedwatermarkcolumn";
    schema.setColumnName(columnName);
    // Locale.ROOT keeps the enum lookup independent of the JVM default locale
    // (e.g. under a Turkish locale "timestamp" would otherwise upper-case to "TİMESTAMP").
    WatermarkType wmType = WatermarkType.valueOf(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "TIMESTAMP").toUpperCase(java.util.Locale.ROOT));
    String dataType;
    switch(wmType) {
        case TIMESTAMP:
            dataType = "timestamp";
            break;
        case DATE:
            dataType = "date";
            break;
        default:
            dataType = "int";
            break;
    }
    String elementDataType = "string";
    // No map symbols apply to the derived watermark column.
    List<String> mapSymbols = null;
    schema.setDataType(this.convertDataType(columnName, dataType, elementDataType, mapSymbols));
    schema.setWaterMark(true);
    schema.setPrimaryKey(0);
    schema.setLength(0);
    schema.setPrecision(0);
    schema.setScale(0);
    schema.setNullable(false);
    schema.setFormat(null);
    schema.setComment("Default watermark column");
    schema.setDefaultValue(null);
    schema.setUnique(false);
    // Round-trip through a JSON string to turn the Schema bean into a JsonObject.
    return gson.fromJson(gson.toJson(schema), JsonObject.class);
}
Also used : WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) Schema(org.apache.gobblin.source.extractor.schema.Schema) JsonObject(com.google.gson.JsonObject)

Example 3 with WatermarkType

use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.

the class SalesforceSource method generateWorkUnits.

/**
 * Generates work units, using histogram-driven dynamic partitioning when it applies.
 *
 * <p>Falls back to the parent implementation unchanged when the watermark type is {@code SIMPLE},
 * no watermark column is configured, dynamic partitioning is disabled, or at most one partition is
 * allowed. Otherwise a histogram over the global partition is used to compute explicit partition
 * points, which are written into {@code state} before delegating to the parent implementation.
 * With early stop enabled, the histogram is cut off once the accumulated record count exceeds the
 * configured limit, and the high watermark is lowered to the first group left out.
 *
 * @param sourceEntity entity to generate work units for
 * @param state source state; partition properties are set on it as a side effect
 * @param previousWatermark watermark carried over from the previous run
 * @return the work units produced by the parent implementation
 * @throws UnsupportedOperationException if early stop is enabled together with full dump mode
 */
@Override
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
    // Locale.ROOT keeps the enum lookup independent of the JVM default locale
    // (e.g. under a Turkish locale "timestamp" would otherwise upper-case to "TİMESTAMP").
    WatermarkType watermarkType = WatermarkType.valueOf(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE).toUpperCase(java.util.Locale.ROOT));
    String watermarkColumn = state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
    int maxPartitions = state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);
    int minTargetPartitionSize = state.getPropAsInt(MIN_TARGET_PARTITION_SIZE, DEFAULT_MIN_TARGET_PARTITION_SIZE);
    // Only support time related watermark
    if (watermarkType == WatermarkType.SIMPLE || Strings.isNullOrEmpty(watermarkColumn) || !state.getPropAsBoolean(ENABLE_DYNAMIC_PARTITIONING) || maxPartitions <= 1) {
        return super.generateWorkUnits(sourceEntity, state, previousWatermark);
    }
    Partitioner partitioner = new Partitioner(state);
    if (isEarlyStopEnabled(state) && partitioner.isFullDump()) {
        throw new UnsupportedOperationException("Early stop mode cannot work with full dump mode.");
    }
    Partition partition = partitioner.getGlobalPartition(previousWatermark);
    Histogram histogram = getHistogram(sourceEntity.getSourceEntityName(), watermarkColumn, state, partition);
    // we should look if the count is too big, cut off early if count exceeds the limit, or bucket size is too large
    Histogram histogramAdjust;
    // TODO: we should consider move this logic into getRefinedHistogram so that we can early terminate the search
    if (isEarlyStopEnabled(state)) {
        // The limit does not change while iterating, so read it once up front.
        long earlyStopRecordLimit = state.getPropAsLong(EARLY_STOP_TOTAL_RECORDS_LIMIT, DEFAULT_EARLY_STOP_TOTAL_RECORDS_LIMIT);
        histogramAdjust = new Histogram();
        for (HistogramGroup group : histogram.getGroups()) {
            // The group that pushes the total over the limit is still included.
            histogramAdjust.add(group);
            if (histogramAdjust.getTotalRecordCount() > earlyStopRecordLimit) {
                break;
            }
        }
    } else {
        histogramAdjust = histogram;
    }
    long expectedHighWatermark = partition.getHighWatermark();
    if (histogramAdjust.getGroups().size() < histogram.getGroups().size()) {
        // The first group that was left out marks where this run stops.
        HistogramGroup lastPlusOne = histogram.get(histogramAdjust.getGroups().size());
        long earlyStopHighWatermark = Long.parseLong(Utils.toDateTimeFormat(lastPlusOne.getKey(), SECONDS_FORMAT, Partitioner.WATERMARKTIMEFORMAT));
        log.info("Job {} will be stopped earlier. [LW : {}, early-stop HW : {}, expected HW : {}]", state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), earlyStopHighWatermark, expectedHighWatermark);
        this.isEarlyStopped = true;
        expectedHighWatermark = earlyStopHighWatermark;
    } else {
        log.info("Job {} will be finished in a single run. [LW : {}, expected HW : {}]", state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), expectedHighWatermark);
    }
    String specifiedPartitions = generateSpecifiedPartitions(histogramAdjust, minTargetPartitionSize, maxPartitions, partition.getLowWatermark(), expectedHighWatermark);
    // Hand the computed partition points to the parent implementation through the state.
    state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
    state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, specifiedPartitions);
    state.setProp(Partitioner.IS_EARLY_STOPPED, isEarlyStopped);
    return super.generateWorkUnits(sourceEntity, state, previousWatermark);
}
Also used : Partition(org.apache.gobblin.source.extractor.partition.Partition) WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) Partitioner(org.apache.gobblin.source.extractor.partition.Partitioner)

Example 4 with WatermarkType

use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.

the class Partitioner method createUserSpecifiedPartitions.

/**
 * Generate the partitions based on the lists specified by the user in job config.
 *
 * <p>Consecutive watermark points become the low and high watermark of successive partitions.
 * With no points configured, a single partition spanning the default watermark value is returned.
 * With exactly one point, a single partition is returned whose high watermark is the current time
 * for non-{@code SIMPLE} watermark types and the default watermark value otherwise.
 *
 * @return the partitions derived from the configured watermark points; never empty
 */
private List<Partition> createUserSpecifiedPartitions() {
    List<Partition> partitions = new ArrayList<>();
    List<String> watermarkPoints = state.getPropAsList(USER_SPECIFIED_PARTITIONS);
    boolean isEarlyStopped = state.getPropAsBoolean(IS_EARLY_STOPPED);
    if (watermarkPoints == null || watermarkPoints.isEmpty()) {
        LOG.info("There should be some partition points");
        long defaultWatermark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
        partitions.add(new Partition(defaultWatermark, defaultWatermark, true, true));
        return partitions;
    }
    // Locale.ROOT keeps the enum lookup independent of the JVM default locale
    // (e.g. under a Turkish locale "timestamp" would otherwise upper-case to "TİMESTAMP").
    WatermarkType watermarkType = WatermarkType.valueOf(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE).toUpperCase(java.util.Locale.ROOT));
    long lowWatermark = adjustWatermark(watermarkPoints.get(0), watermarkType);
    long highWatermark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
    // Only one partition point specified
    if (watermarkPoints.size() == 1) {
        if (watermarkType != WatermarkType.SIMPLE) {
            String timeZone = this.state.getProp(ConfigurationKeys.SOURCE_TIMEZONE);
            String currentTime = Utils.dateTimeToString(getCurrentTime(timeZone), WATERMARKTIMEFORMAT, timeZone);
            highWatermark = adjustWatermark(currentTime, watermarkType);
        }
        partitions.add(new Partition(lowWatermark, highWatermark, true, false));
        return partitions;
    }
    // Every point except the first and the last closes one partition and opens the next.
    int lastIndex = watermarkPoints.size() - 1;
    for (int i = 1; i < lastIndex; i++) {
        highWatermark = adjustWatermark(watermarkPoints.get(i), watermarkType);
        partitions.add(new Partition(lowWatermark, highWatermark, true));
        lowWatermark = highWatermark;
    }
    // Last partition
    highWatermark = adjustWatermark(watermarkPoints.get(lastIndex), watermarkType);
    ExtractType extractType = ExtractType.valueOf(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE).toUpperCase(java.util.Locale.ROOT));
    // If it is early stop, we should not remove upper bounds
    if ((isFullDump() || isSnapshot(extractType)) && !isEarlyStopped) {
        // The upper bounds can be removed for last work unit
        partitions.add(new Partition(lowWatermark, highWatermark, true, false));
    } else {
        // The upper bounds can not be removed for last work unit
        partitions.add(new Partition(lowWatermark, highWatermark, true, true));
    }
    return partitions;
}
Also used : WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) ArrayList(java.util.ArrayList) ExtractType(org.apache.gobblin.source.extractor.extract.ExtractType)

Example 5 with WatermarkType

use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.

the class Partitioner method getPartitions.

/**
 * Get partitions with low and high water marks.
 *
 * <p>When no watermark is configured, or when either the low or the high watermark resolves to the
 * default watermark value, a single-entry map is returned instead of a computed partition list.
 *
 * @param previousWatermark previous water mark from metadata
 * @return map of partition intervals.
 *         map's key is interval begin time (in format {@link Partitioner#WATERMARKTIMEFORMAT})
 *         map's value is interval end time (in format {@link Partitioner#WATERMARKTIMEFORMAT})
 */
@Deprecated
public HashMap<Long, Long> getPartitions(long previousWatermark) {
    HashMap<Long, Long> defaultPartition = Maps.newHashMap();
    if (!isWatermarkExists()) {
        defaultPartition.put(ConfigurationKeys.DEFAULT_WATERMARK_VALUE, ConfigurationKeys.DEFAULT_WATERMARK_VALUE);
        LOG.info("Watermark column or type not found - Default partition with low watermark and high watermark as " + ConfigurationKeys.DEFAULT_WATERMARK_VALUE);
        return defaultPartition;
    }
    // Locale.ROOT keeps both enum lookups independent of the JVM default locale
    // (e.g. under a Turkish locale "timestamp" would otherwise upper-case to "TİMESTAMP").
    ExtractType extractType = ExtractType.valueOf(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE).toUpperCase(java.util.Locale.ROOT));
    WatermarkType watermarkType = WatermarkType.valueOf(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE).toUpperCase(java.util.Locale.ROOT));
    int interval = getUpdatedInterval(this.state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_PARTITION_INTERVAL, 0), extractType, watermarkType);
    // A configured value of 0 means "not set", so the default maximum applies.
    int sourceMaxAllowedPartitions = this.state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 0);
    int maxPartitions = (sourceMaxAllowedPartitions != 0 ? sourceMaxAllowedPartitions : ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);
    WatermarkPredicate watermark = new WatermarkPredicate(null, watermarkType);
    int deltaForNextWatermark = watermark.getDeltaNumForNextWatermark();
    LOG.info("is watermark override: " + this.isWatermarkOverride());
    LOG.info("is full extract: " + this.isFullDump());
    long lowWatermark = this.getLowWatermark(extractType, watermarkType, previousWatermark, deltaForNextWatermark);
    long highWatermark = this.getHighWatermark(extractType, watermarkType);
    if (lowWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE || highWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
        LOG.info("Low watermark or high water mark is not found. Hence cannot generate partitions - Default partition with low watermark:  " + lowWatermark + " and high watermark: " + highWatermark);
        defaultPartition.put(lowWatermark, highWatermark);
        return defaultPartition;
    }
    LOG.info("Generate partitions with low watermark: " + lowWatermark + "; high watermark: " + highWatermark + "; partition interval in hours: " + interval + "; Maximum number of allowed partitions: " + maxPartitions);
    return watermark.getPartitions(lowWatermark, highWatermark, interval, maxPartitions);
}
Also used : WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) WatermarkPredicate(org.apache.gobblin.source.extractor.watermark.WatermarkPredicate) ExtractType(org.apache.gobblin.source.extractor.extract.ExtractType)

Aggregations

WatermarkType (org.apache.gobblin.source.extractor.watermark.WatermarkType): 7, ExtractType (org.apache.gobblin.source.extractor.extract.ExtractType): 3, WatermarkPredicate (org.apache.gobblin.source.extractor.watermark.WatermarkPredicate): 3, JsonObject (com.google.gson.JsonObject): 1, IOException (java.io.IOException): 1, ArrayList (java.util.ArrayList): 1, Date (java.util.Date): 1, DataRecordException (org.apache.gobblin.source.extractor.DataRecordException): 1, ExtractPrepareException (org.apache.gobblin.source.extractor.exception.ExtractPrepareException): 1, HighWatermarkException (org.apache.gobblin.source.extractor.exception.HighWatermarkException): 1, RecordCountException (org.apache.gobblin.source.extractor.exception.RecordCountException): 1, SchemaException (org.apache.gobblin.source.extractor.exception.SchemaException): 1, Partition (org.apache.gobblin.source.extractor.partition.Partition): 1, Partitioner (org.apache.gobblin.source.extractor.partition.Partitioner): 1, Schema (org.apache.gobblin.source.extractor.schema.Schema): 1