Use of org.apache.gobblin.source.extractor.partition.Partition in project incubator-gobblin by apache.
The class QueryBasedSource, method generateWorkUnits.
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
  List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark);
  Collections.sort(partitions, Partitioner.ascendingComparator);
  // {@link ConfigurationKeys.EXTRACT_TABLE_NAME_KEY} specifies the output path for the Extract
  String outputTableName = sourceEntity.getDestTableName();
  log.info("Creating extract output with table name " + outputTableName);
  Extract extract = createExtract(tableType, nameSpaceName, outputTableName);
  // Set the current time for a full extract
  if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
    extract.setFullTrue(System.currentTimeMillis());
  }
  for (Partition partition : partitions) {
    WorkUnit workunit = WorkUnit.create(extract);
    workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName());
    workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName());
    workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION);
    addLineageSourceInfo(state, sourceEntity, workunit);
    partition.serialize(workunit);
    workUnits.add(workunit);
  }
  return workUnits;
}
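The examples on this page hand partition boundaries to downstream extractors by serializing a Partition into each WorkUnit; the extractor recovers them with the static Partition.deserialize, as in the GoogleWebMasterSourceDaily example below. A minimal round-trip sketch, assuming a two-argument Partition(lowWatermark, highWatermark) constructor and yyyyMMddHHmmss-style long watermarks (both are assumptions; serialize, deserialize, and the watermark getters all appear in the snippets on this page):

// Hypothetical values; the two-argument constructor and the watermark format are assumptions.
WorkUnit workUnit = WorkUnit.create(extract); // extract created as in generateWorkUnits above
Partition partition = new Partition(20170401000000L, 20170402000000L);
partition.serialize(workUnit); // writes the boundaries into the work unit's properties

// Later, typically inside the extractor:
Partition restored = Partition.deserialize(workUnit);
assert restored.getLowWatermark() == 20170401000000L;
assert restored.getHighWatermark() == 20170402000000L;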
Use of org.apache.gobblin.source.extractor.partition.Partition in project incubator-gobblin by apache.
The class SalesforceSource, method generateWorkUnits.
@Override
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
  WatermarkType watermarkType = WatermarkType.valueOf(
      state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE).toUpperCase());
  String watermarkColumn = state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
  int maxPartitions = state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS,
      ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);
  int minTargetPartitionSize = state.getPropAsInt(MIN_TARGET_PARTITION_SIZE, DEFAULT_MIN_TARGET_PARTITION_SIZE);
  // Only time-related watermarks are supported
  if (watermarkType == WatermarkType.SIMPLE || Strings.isNullOrEmpty(watermarkColumn)
      || !state.getPropAsBoolean(ENABLE_DYNAMIC_PARTITIONING) || maxPartitions <= 1) {
    return super.generateWorkUnits(sourceEntity, state, previousWatermark);
  }
  Partitioner partitioner = new Partitioner(state);
  if (isEarlyStopEnabled(state) && partitioner.isFullDump()) {
    throw new UnsupportedOperationException("Early stop mode cannot work with full dump mode.");
  }
  Partition partition = partitioner.getGlobalPartition(previousWatermark);
  Histogram histogram = getHistogram(sourceEntity.getSourceEntityName(), watermarkColumn, state, partition);
  // Check whether the count is too big; cut off early if it exceeds the limit or the bucket size is too large
  Histogram histogramAdjust;
  // TODO: consider moving this logic into getRefinedHistogram so that the search can terminate early
  if (isEarlyStopEnabled(state)) {
    histogramAdjust = new Histogram();
    for (HistogramGroup group : histogram.getGroups()) {
      histogramAdjust.add(group);
      if (histogramAdjust.getTotalRecordCount()
          > state.getPropAsLong(EARLY_STOP_TOTAL_RECORDS_LIMIT, DEFAULT_EARLY_STOP_TOTAL_RECORDS_LIMIT)) {
        break;
      }
    }
  } else {
    histogramAdjust = histogram;
  }
  long expectedHighWatermark = partition.getHighWatermark();
  if (histogramAdjust.getGroups().size() < histogram.getGroups().size()) {
    HistogramGroup lastPlusOne = histogram.get(histogramAdjust.getGroups().size());
    long earlyStopHighWatermark =
        Long.parseLong(Utils.toDateTimeFormat(lastPlusOne.getKey(), SECONDS_FORMAT, Partitioner.WATERMARKTIMEFORMAT));
    log.info("Job {} will be stopped early. [LW: {}, early-stop HW: {}, expected HW: {}]",
        state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), earlyStopHighWatermark,
        expectedHighWatermark);
    this.isEarlyStopped = true;
    expectedHighWatermark = earlyStopHighWatermark;
  } else {
    log.info("Job {} will finish in a single run. [LW: {}, expected HW: {}]",
        state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), expectedHighWatermark);
  }
  String specifiedPartitions = generateSpecifiedPartitions(histogramAdjust, minTargetPartitionSize, maxPartitions,
      partition.getLowWatermark(), expectedHighWatermark);
  state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, specifiedPartitions);
  state.setProp(Partitioner.IS_EARLY_STOPPED, isEarlyStopped);
  return super.generateWorkUnits(sourceEntity, state, previousWatermark);
}
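The early-stop branch above is a simple prefix cutoff: groups are accumulated in watermark order until the running record count first exceeds the configured limit, and the group that crosses the limit is still kept (add, then check). A self-contained sketch of just that cutoff, with plain longs standing in for the source's Histogram and HistogramGroup types (a hypothetical helper, not Gobblin API):

import java.util.ArrayList;
import java.util.List;

// Keep group record counts until the running total first exceeds the limit;
// the crossing group is included, matching the add-then-check order above.
static List<Long> truncateAtLimit(List<Long> groupCounts, long totalRecordsLimit) {
  List<Long> kept = new ArrayList<>();
  long total = 0;
  for (long count : groupCounts) {
    kept.add(count);
    total += count;
    if (total > totalRecordsLimit) {
      break; // remaining groups are deferred to the next run via the early-stop high watermark
    }
  }
  return kept;
}

For example, with counts [40, 40, 40] and a limit of 70, the first two groups are kept (40 + 40 > 70), and the third group plays the role of lastPlusOne above: its key becomes the early-stop high watermark.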
Use of org.apache.gobblin.source.extractor.partition.Partition in project incubator-gobblin by apache.
The class GoogleWebMasterSourceDaily, method createExtractor.
@Override
GoogleWebmasterExtractor createExtractor(WorkUnitState state, Map<String, Integer> columnPositionMap,
    List<GoogleWebmasterFilter.Dimension> requestedDimensions,
    List<GoogleWebmasterDataFetcher.Metric> requestedMetrics, JsonArray schemaJson) throws IOException {
  Preconditions.checkArgument(
      state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE).compareToIgnoreCase("Hour") == 0);
  Preconditions.checkArgument(state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_PARTITION_INTERVAL) == 24);
  Partition partition = Partition.deserialize(state.getWorkunit());
  long lowWatermark = partition.getLowWatermark();
  long expectedHighWatermark = partition.getHighWatermark();
  /*
   * This adjustment is needed because:
   * 1. The partition behavior changed due to commit 7d730fcb0263b8ca820af0366818160d638d1336 [7d730fc]
   *    by zxcware <zxcware@gmail.com> on April 3, 2017 at 11:47:41 AM PDT.
   * 2. The Google Search Console API only cares about dates, and both sides are inclusive.
   * Therefore, apply the following processing.
   */
  int dateDiff = partition.isHighWatermarkInclusive() ? 1 : 0;
  long highWatermarkDate = DateWatermark.adjustWatermark(Long.toString(expectedHighWatermark), dateDiff);
  long updatedExpectedHighWatermark = TimestampWatermark.adjustWatermark(Long.toString(highWatermarkDate), -1);
  updatedExpectedHighWatermark = Math.max(lowWatermark, updatedExpectedHighWatermark);
  GoogleWebmasterClientImpl gscClient =
      new GoogleWebmasterClientImpl(getCredential(state), state.getProp(ConfigurationKeys.SOURCE_ENTITY));
  return new GoogleWebmasterExtractor(gscClient, state, lowWatermark, updatedExpectedHighWatermark,
      columnPositionMap, requestedDimensions, requestedMetrics, schemaJson);
}
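Concretely, the two adjustWatermark calls normalize either high-watermark convention to the last second of the final full day. A worked example, assuming yyyyMMddHHmmss long watermarks and that DateWatermark and TimestampWatermark shift by whole days and seconds respectively (units inferred from how they are called above, not confirmed against the watermark classes):

// Exclusive HW: expectedHighWatermark = 20170405000000, dateDiff = 0
//   DateWatermark.adjustWatermark("20170405000000", 0)       -> 20170405000000
//   TimestampWatermark.adjustWatermark("20170405000000", -1) -> 20170404235959
//
// Inclusive HW: expectedHighWatermark = 20170404000000, dateDiff = 1
//   DateWatermark.adjustWatermark("20170404000000", 1)       -> 20170405000000
//   TimestampWatermark.adjustWatermark("20170405000000", -1) -> 20170404235959
//
// Either way the extractor pulls through 2017-04-04 23:59:59, i.e. the whole
// last day, matching the date-granular, both-sides-inclusive API.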