use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.
the class QueryBasedExtractor method getLowWatermarkWithNoDelta.
/**
 * Backs the "delta for next watermark" out of a low watermark.
 *
 * <p>The default watermark value is returned unchanged. For {@code SIMPLE} watermarks the delta is
 * subtracted numerically; for every other watermark type {@code lwm} is read as a
 * {@code yyyyMMddHHmmss} value and the delta is subtracted from it as a number of seconds.
 *
 * @param lwm low watermark
 * @return {@code lwm} with the delta removed, or the default watermark value when {@code lwm} is
 *         the default
 */
private long getLowWatermarkWithNoDelta(long lwm) {
  if (lwm == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
    return ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
  }

  String watermarkType = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "TIMESTAMP");
  // Upper-case with Locale.ROOT: the no-arg toUpperCase() uses the JVM default locale, and under
  // e.g. a Turkish locale "timestamp" becomes "TİMESTAMP", which makes valueOf() throw.
  WatermarkType wmType = WatermarkType.valueOf(watermarkType.toUpperCase(java.util.Locale.ROOT));
  int deltaNum = new WatermarkPredicate(wmType).getDeltaNumForNextWatermark();

  switch (wmType) {
    case SIMPLE:
      return lwm - deltaNum;
    default:
      // Time-based watermark: convert to a date, step back deltaNum seconds, convert back.
      Date lowWaterMarkDate = Utils.toDate(lwm, "yyyyMMddHHmmss");
      return Long.parseLong(
          Utils.dateToString(Utils.addSecondsToDate(lowWaterMarkDate, deltaNum * -1), "yyyyMMddHHmmss"));
  }
}
use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.
the class JdbcExtractor method getDefaultWatermark.
/**
 * Builds the schema of the default watermark column, which is required if there are multiple
 * watermarks.
 *
 * <p>The column is named {@code derivedwatermarkcolumn}. Its data type follows the configured
 * watermark type: {@code timestamp} for TIMESTAMP, {@code date} for DATE and {@code int} for
 * anything else.
 *
 * @return column schema
 */
private JsonObject getDefaultWatermark() {
  Schema schema = new Schema();
  String dataType;
  String columnName = "derivedwatermarkcolumn";

  schema.setColumnName(columnName);

  // Upper-case with Locale.ROOT: the no-arg toUpperCase() uses the JVM default locale, and under
  // e.g. a Turkish locale "timestamp" becomes "TİMESTAMP", which makes valueOf() throw.
  WatermarkType wmType = WatermarkType.valueOf(
      this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "TIMESTAMP")
          .toUpperCase(java.util.Locale.ROOT));
  switch (wmType) {
    case TIMESTAMP:
      dataType = "timestamp";
      break;
    case DATE:
      dataType = "date";
      break;
    default:
      dataType = "int";
      break;
  }

  String elementDataType = "string";
  List<String> mapSymbols = null;
  JsonObject newDataType = this.convertDataType(columnName, dataType, elementDataType, mapSymbols);

  schema.setDataType(newDataType);
  schema.setWaterMark(true);
  schema.setPrimaryKey(0);
  schema.setLength(0);
  schema.setPrecision(0);
  schema.setScale(0);
  schema.setNullable(false);
  schema.setFormat(null);
  schema.setComment("Default watermark column");
  schema.setDefaultValue(null);
  schema.setUnique(false);

  // Round-trip through a JSON string to obtain the schema as a JsonObject. fromJson already
  // returns a JsonObject here, so no further getAsJsonObject() call is needed.
  String jsonStr = gson.toJson(schema);
  return gson.fromJson(jsonStr, JsonObject.class);
}
use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.
the class SalesforceSource method generateWorkUnits.
/**
 * Generates work units, optionally deriving the partition points from a record-count histogram.
 *
 * <p>Delegates straight to the superclass when the watermark type is {@code SIMPLE}, no watermark
 * column is configured, dynamic partitioning is not enabled, or at most one partition is allowed.
 * Otherwise a histogram is computed over the global partition, optionally truncated once the
 * running record count exceeds the early-stop limit, and turned into user-specified partitions
 * that are written into {@code state} before delegating to the superclass.
 *
 * @param sourceEntity entity to pull
 * @param state source state; partition-related properties are set on it by this method
 * @param previousWatermark watermark used to compute the global partition
 * @return the work units produced by the superclass
 * @throws UnsupportedOperationException if early stop is enabled together with full dump mode
 */
@Override
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
  // Upper-case with Locale.ROOT: the no-arg toUpperCase() uses the JVM default locale, and under
  // e.g. a Turkish locale "simple" becomes "SİMPLE", which makes valueOf() throw.
  WatermarkType watermarkType = WatermarkType.valueOf(
      state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE)
          .toUpperCase(java.util.Locale.ROOT));
  String watermarkColumn = state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);

  int maxPartitions = state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS,
      ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);
  int minTargetPartitionSize = state.getPropAsInt(MIN_TARGET_PARTITION_SIZE, DEFAULT_MIN_TARGET_PARTITION_SIZE);

  // Only support time related watermark
  if (watermarkType == WatermarkType.SIMPLE
      || Strings.isNullOrEmpty(watermarkColumn)
      || !state.getPropAsBoolean(ENABLE_DYNAMIC_PARTITIONING)
      || maxPartitions <= 1) {
    return super.generateWorkUnits(sourceEntity, state, previousWatermark);
  }

  Partitioner partitioner = new Partitioner(state);
  if (isEarlyStopEnabled(state) && partitioner.isFullDump()) {
    throw new UnsupportedOperationException("Early stop mode cannot work with full dump mode.");
  }

  Partition partition = partitioner.getGlobalPartition(previousWatermark);
  Histogram histogram = getHistogram(sourceEntity.getSourceEntityName(), watermarkColumn, state, partition);

  // we should look if the count is too big, cut off early if count exceeds the limit, or bucket size is too large
  Histogram histogramAdjust;

  // TODO: we should consider move this logic into getRefinedHistogram so that we can early terminate the search
  if (isEarlyStopEnabled(state)) {
    // The limit does not change while iterating, so read it once instead of once per group.
    long totalRecordsLimit =
        state.getPropAsLong(EARLY_STOP_TOTAL_RECORDS_LIMIT, DEFAULT_EARLY_STOP_TOTAL_RECORDS_LIMIT);
    histogramAdjust = new Histogram();
    for (HistogramGroup group : histogram.getGroups()) {
      // The group that pushes the total over the limit is still included.
      histogramAdjust.add(group);
      if (histogramAdjust.getTotalRecordCount() > totalRecordsLimit) {
        break;
      }
    }
  } else {
    histogramAdjust = histogram;
  }

  long expectedHighWatermark = partition.getHighWatermark();
  if (histogramAdjust.getGroups().size() < histogram.getGroups().size()) {
    // The histogram was truncated: the first group that was left out marks the new high watermark.
    HistogramGroup lastPlusOne = histogram.get(histogramAdjust.getGroups().size());
    long earlyStopHighWatermark = Long.parseLong(
        Utils.toDateTimeFormat(lastPlusOne.getKey(), SECONDS_FORMAT, Partitioner.WATERMARKTIMEFORMAT));
    log.info("Job {} will be stopped earlier. [LW : {}, early-stop HW : {}, expected HW : {}]",
        state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), earlyStopHighWatermark,
        expectedHighWatermark);
    this.isEarlyStopped = true;
    expectedHighWatermark = earlyStopHighWatermark;
  } else {
    log.info("Job {} will be finished in a single run. [LW : {}, expected HW : {}]",
        state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), expectedHighWatermark);
  }

  String specifiedPartitions = generateSpecifiedPartitions(histogramAdjust, minTargetPartitionSize, maxPartitions,
      partition.getLowWatermark(), expectedHighWatermark);
  // Hand the computed partition points to the superclass through the state.
  state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, specifiedPartitions);
  state.setProp(Partitioner.IS_EARLY_STOPPED, isEarlyStopped);

  return super.generateWorkUnits(sourceEntity, state, previousWatermark);
}
use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.
the class Partitioner method createUserSpecifiedPartitions.
/**
 * Generate the partitions based on the list of watermark points specified by the user in job
 * config.
 *
 * <ul>
 *   <li>No points: a single partition whose low and high watermarks are both the default
 *       watermark value.</li>
 *   <li>One point: a single partition starting at that point; for non-{@code SIMPLE} watermark
 *       types the high watermark is the current time, otherwise it stays at the default value.</li>
 *   <li>Two or more points: one partition per pair of consecutive points.</li>
 * </ul>
 *
 * @return the partitions, never empty
 */
private List<Partition> createUserSpecifiedPartitions() {
  List<Partition> partitions = new ArrayList<>();

  List<String> watermarkPoints = state.getPropAsList(USER_SPECIFIED_PARTITIONS);
  boolean isEarlyStopped = state.getPropAsBoolean(IS_EARLY_STOPPED);

  if (watermarkPoints == null || watermarkPoints.isEmpty()) {
    LOG.info("There should be some partition points");
    long defaultWatermark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
    partitions.add(new Partition(defaultWatermark, defaultWatermark, true, true));
    return partitions;
  }

  // Upper-case with Locale.ROOT: the no-arg toUpperCase() uses the JVM default locale, and under
  // e.g. a Turkish locale "timestamp" becomes "TİMESTAMP", which makes valueOf() throw.
  WatermarkType watermarkType = WatermarkType.valueOf(
      state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE)
          .toUpperCase(java.util.Locale.ROOT));

  long lowWatermark = adjustWatermark(watermarkPoints.get(0), watermarkType);
  long highWatermark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;

  // Only one partition point specified
  if (watermarkPoints.size() == 1) {
    if (watermarkType != WatermarkType.SIMPLE) {
      String timeZone = this.state.getProp(ConfigurationKeys.SOURCE_TIMEZONE);
      String currentTime = Utils.dateTimeToString(getCurrentTime(timeZone), WATERMARKTIMEFORMAT, timeZone);
      highWatermark = adjustWatermark(currentTime, watermarkType);
    }
    partitions.add(new Partition(lowWatermark, highWatermark, true, false));
    return partitions;
  }

  // Every point except the first and the last closes one partition and opens the next.
  int lastIndex = watermarkPoints.size() - 1;
  for (int i = 1; i < lastIndex; i++) {
    highWatermark = adjustWatermark(watermarkPoints.get(i), watermarkType);
    partitions.add(new Partition(lowWatermark, highWatermark, true));
    lowWatermark = highWatermark;
  }

  // Last partition
  highWatermark = adjustWatermark(watermarkPoints.get(lastIndex), watermarkType);
  ExtractType extractType = ExtractType.valueOf(
      this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE).toUpperCase(java.util.Locale.ROOT));

  // If it is early stop, we should not remove upper bounds
  if ((isFullDump() || isSnapshot(extractType)) && !isEarlyStopped) {
    // The upper bounds can be removed for last work unit
    partitions.add(new Partition(lowWatermark, highWatermark, true, false));
  } else {
    // The upper bounds can not be removed for last work unit
    partitions.add(new Partition(lowWatermark, highWatermark, true, true));
  }

  return partitions;
}
use of org.apache.gobblin.source.extractor.watermark.WatermarkType in project incubator-gobblin by apache.
the class Partitioner method getPartitions.
/**
 * Get partitions with low and high watermarks.
 *
 * <p>When no watermark is configured, or when either the low or the high watermark resolves to
 * the default watermark value, a single-entry map holding that one (low, high) pair is returned
 * instead of a computed set of partitions.
 *
 * @param previousWatermark previous watermark from metadata
 * @return map of partition intervals.
 *         map's key is interval begin time (in format {@link Partitioner#WATERMARKTIMEFORMAT})
 *         map's value is interval end time (in format {@link Partitioner#WATERMARKTIMEFORMAT})
 */
@Deprecated
public HashMap<Long, Long> getPartitions(long previousWatermark) {
  HashMap<Long, Long> defaultPartition = Maps.newHashMap();
  if (!isWatermarkExists()) {
    defaultPartition.put(ConfigurationKeys.DEFAULT_WATERMARK_VALUE, ConfigurationKeys.DEFAULT_WATERMARK_VALUE);
    LOG.info("Watermark column or type not found - Default partition with low watermark and high watermark as " + ConfigurationKeys.DEFAULT_WATERMARK_VALUE);
    return defaultPartition;
  }

  // Upper-case with Locale.ROOT: the no-arg toUpperCase() uses the JVM default locale, and under
  // e.g. a Turkish locale "timestamp" becomes "TİMESTAMP", which makes valueOf() throw.
  ExtractType extractType = ExtractType.valueOf(
      this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE).toUpperCase(java.util.Locale.ROOT));
  WatermarkType watermarkType = WatermarkType.valueOf(
      this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE)
          .toUpperCase(java.util.Locale.ROOT));

  int interval = getUpdatedInterval(
      this.state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_PARTITION_INTERVAL, 0), extractType, watermarkType);
  // A configured value of 0 means "not set": fall back to the default maximum.
  int sourceMaxAllowedPartitions = this.state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 0);
  int maxPartitions = (sourceMaxAllowedPartitions != 0 ? sourceMaxAllowedPartitions : ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);

  WatermarkPredicate watermark = new WatermarkPredicate(null, watermarkType);
  int deltaForNextWatermark = watermark.getDeltaNumForNextWatermark();

  LOG.info("is watermark override: " + this.isWatermarkOverride());
  LOG.info("is full extract: " + this.isFullDump());

  long lowWatermark = this.getLowWatermark(extractType, watermarkType, previousWatermark, deltaForNextWatermark);
  long highWatermark = this.getHighWatermark(extractType, watermarkType);

  if (lowWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE || highWatermark == ConfigurationKeys.DEFAULT_WATERMARK_VALUE) {
    LOG.info("Low watermark or high water mark is not found. Hence cannot generate partitions - Default partition with low watermark: " + lowWatermark + " and high watermark: " + highWatermark);
    defaultPartition.put(lowWatermark, highWatermark);
    return defaultPartition;
  }

  LOG.info("Generate partitions with low watermark: " + lowWatermark + "; high watermark: " + highWatermark + "; partition interval in hours: " + interval + "; Maximum number of allowed partitions: " + maxPartitions);
  return watermark.getPartitions(lowWatermark, highWatermark, interval, maxPartitions);
}
Aggregations