Search in sources :

Example 1 with Partition

use of org.apache.gobblin.source.extractor.partition.Partition in project incubator-gobblin by apache.

The following is from the class QueryBasedSource, method generateWorkUnits:

/**
 * Generates one {@link WorkUnit} per watermark partition for the given source entity.
 *
 * <p>Partitions are computed by {@link Partitioner} from the previous run's watermark,
 * sorted in ascending order, and each partition's boundaries are serialized into its
 * work unit so the extractor can read them back.
 *
 * @param sourceEntity entity whose destination table the work units extract into
 * @param state source state holding extract and partition configuration
 * @param previousWatermark low watermark carried over from the previous run
 * @return work units, one per partition, in ascending partition order
 */
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
    List<WorkUnit> workUnits = Lists.newArrayList();
    String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
    List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark);
    // Process partitions oldest-first so work units are created in ascending watermark order.
    partitions.sort(Partitioner.ascendingComparator);
    // {@link ConfigurationKeys.EXTRACT_TABLE_NAME_KEY} specify the output path for Extract
    String outputTableName = sourceEntity.getDestTableName();
    log.info("Create extract output with table name is " + outputTableName);
    Extract extract = createExtract(tableType, nameSpaceName, outputTableName);
    // Setting current time for the full extract.
    // parseBoolean avoids boxing and, like Boolean.valueOf, treats a missing (null) prop as false.
    if (Boolean.parseBoolean(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
        extract.setFullTrue(System.currentTimeMillis());
    }
    for (Partition partition : partitions) {
        WorkUnit workunit = WorkUnit.create(extract);
        workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName());
        workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName());
        workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION);
        addLineageSourceInfo(state, sourceEntity, workunit);
        // Persist the partition boundaries into the work unit for the extractor to deserialize.
        partition.serialize(workunit);
        workUnits.add(workunit);
    }
    return workUnits;
}
Also used : Partition(org.apache.gobblin.source.extractor.partition.Partition) TableType(org.apache.gobblin.source.workunit.Extract.TableType) Extract(org.apache.gobblin.source.workunit.Extract) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Partitioner(org.apache.gobblin.source.extractor.partition.Partitioner)

Example 2 with Partition

use of org.apache.gobblin.source.extractor.partition.Partition in project incubator-gobblin by apache.

The following is from the class SalesforceSource, method generateWorkUnits:

/**
 * Generates work units for Salesforce using dynamic, histogram-driven partitioning on a
 * time watermark. Falls back to the superclass behavior when dynamic partitioning does
 * not apply (SIMPLE watermark, no delta column, disabled, or a single partition).
 *
 * <p>When early stop is enabled, the histogram is truncated once the accumulated record
 * count exceeds the configured limit, the expected high watermark is pulled back to the
 * first excluded group, and {@code isEarlyStopped} is recorded so a later run resumes.
 *
 * @param sourceEntity entity to extract
 * @param state source state holding watermark and partitioning configuration
 * @param previousWatermark low watermark carried over from the previous run
 * @return work units produced by the superclass from the computed partition spec
 * @throws UnsupportedOperationException if early stop is combined with full-dump mode
 */
@Override
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
    WatermarkType watermarkType = WatermarkType.valueOf(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE).toUpperCase());
    String watermarkColumn = state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
    int maxPartitions = state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);
    int minTargetPartitionSize = state.getPropAsInt(MIN_TARGET_PARTITION_SIZE, DEFAULT_MIN_TARGET_PARTITION_SIZE);
    // Only support time related watermark
    if (watermarkType == WatermarkType.SIMPLE || Strings.isNullOrEmpty(watermarkColumn) || !state.getPropAsBoolean(ENABLE_DYNAMIC_PARTITIONING) || maxPartitions <= 1) {
        return super.generateWorkUnits(sourceEntity, state, previousWatermark);
    }
    Partitioner partitioner = new Partitioner(state);
    if (isEarlyStopEnabled(state) && partitioner.isFullDump()) {
        throw new UnsupportedOperationException("Early stop mode cannot work with full dump mode.");
    }
    Partition partition = partitioner.getGlobalPartition(previousWatermark);
    Histogram histogram = getHistogram(sourceEntity.getSourceEntityName(), watermarkColumn, state, partition);
    // we should look if the count is too big, cut off early if count exceeds the limit, or bucket size is too large
    Histogram histogramAdjust;
    // TODO: we should consider move this logic into getRefinedHistogram so that we can early terminate the search
    if (isEarlyStopEnabled(state)) {
        histogramAdjust = new Histogram();
        // Hoisted: the record limit is loop-invariant, so read the config once instead of per group.
        long earlyStopRecordLimit = state.getPropAsLong(EARLY_STOP_TOTAL_RECORDS_LIMIT, DEFAULT_EARLY_STOP_TOTAL_RECORDS_LIMIT);
        for (HistogramGroup group : histogram.getGroups()) {
            histogramAdjust.add(group);
            if (histogramAdjust.getTotalRecordCount() > earlyStopRecordLimit) {
                break;
            }
        }
    } else {
        histogramAdjust = histogram;
    }
    long expectedHighWatermark = partition.getHighWatermark();
    // If the histogram was truncated, stop early: the new high watermark is the key of the
    // first group that was excluded, converted from the histogram's SECONDS_FORMAT.
    if (histogramAdjust.getGroups().size() < histogram.getGroups().size()) {
        HistogramGroup lastPlusOne = histogram.get(histogramAdjust.getGroups().size());
        long earlyStopHighWatermark = Long.parseLong(Utils.toDateTimeFormat(lastPlusOne.getKey(), SECONDS_FORMAT, Partitioner.WATERMARKTIMEFORMAT));
        log.info("Job {} will be stopped earlier. [LW : {}, early-stop HW : {}, expected HW : {}]", state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), earlyStopHighWatermark, expectedHighWatermark);
        this.isEarlyStopped = true;
        expectedHighWatermark = earlyStopHighWatermark;
    } else {
        log.info("Job {} will be finished in a single run. [LW : {}, expected HW : {}]", state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), expectedHighWatermark);
    }
    // Hand the computed partition boundaries to the superclass as user-specified partitions.
    String specifiedPartitions = generateSpecifiedPartitions(histogramAdjust, minTargetPartitionSize, maxPartitions, partition.getLowWatermark(), expectedHighWatermark);
    state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
    state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, specifiedPartitions);
    state.setProp(Partitioner.IS_EARLY_STOPPED, isEarlyStopped);
    return super.generateWorkUnits(sourceEntity, state, previousWatermark);
}
Also used : Partition(org.apache.gobblin.source.extractor.partition.Partition) WatermarkType(org.apache.gobblin.source.extractor.watermark.WatermarkType) Partitioner(org.apache.gobblin.source.extractor.partition.Partitioner)

Example 3 with Partition

use of org.apache.gobblin.source.extractor.partition.Partition in project incubator-gobblin by apache.

The following is from the class GoogleWebMasterSourceDaily, method createExtractor:

/**
 * Creates a daily Google Webmaster extractor for the work unit's partition.
 *
 * <p>Requires an hourly watermark with a 24-hour partition interval (i.e. exactly one
 * day per partition). The partition's high watermark is adjusted so the extractor's
 * date range matches the Search Console API's inclusive-date semantics.
 *
 * @param state work unit state carrying the serialized partition and source config
 * @param columnPositionMap column name to output-position mapping
 * @param requestedDimensions dimensions to request from the API
 * @param requestedMetrics metrics to request from the API
 * @param schemaJson output schema
 * @return an extractor covering [lowWatermark, adjusted high watermark]
 * @throws IOException if the webmaster client cannot be created
 */
@Override
GoogleWebmasterExtractor createExtractor(WorkUnitState state, Map<String, Integer> columnPositionMap, List<GoogleWebmasterFilter.Dimension> requestedDimensions, List<GoogleWebmasterDataFetcher.Metric> requestedMetrics, JsonArray schemaJson) throws IOException {
    // equalsIgnoreCase is the idiomatic (and null-safe on the argument) form of compareToIgnoreCase(...) == 0.
    Preconditions.checkArgument(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE).equalsIgnoreCase("Hour"));
    Preconditions.checkArgument(state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_PARTITION_INTERVAL) == 24);
    Partition partition = Partition.deserialize(state.getWorkunit());
    long lowWatermark = partition.getLowWatermark();
    long expectedHighWatermark = partition.getHighWatermark();
    /*
      This change is needed because
      1. The partition behavior changed due to commit 7d730fcb0263b8ca820af0366818160d638d1336 [7d730fc]
       by zxcware <zxcware@gmail.com> on April 3, 2017 at 11:47:41 AM PDT
      2. Google Search Console API only cares about Dates, and are both side inclusive.
      Therefore, do the following processing.
     */
    int dateDiff = partition.isHighWatermarkInclusive() ? 1 : 0;
    long highWatermarkDate = DateWatermark.adjustWatermark(Long.toString(expectedHighWatermark), dateDiff);
    long updatedExpectedHighWatermark = TimestampWatermark.adjustWatermark(Long.toString(highWatermarkDate), -1);
    // Guard against the adjustment moving the high watermark below the low watermark.
    updatedExpectedHighWatermark = Math.max(lowWatermark, updatedExpectedHighWatermark);
    GoogleWebmasterClientImpl gscClient = new GoogleWebmasterClientImpl(getCredential(state), state.getProp(ConfigurationKeys.SOURCE_ENTITY));
    return new GoogleWebmasterExtractor(gscClient, state, lowWatermark, updatedExpectedHighWatermark, columnPositionMap, requestedDimensions, requestedMetrics, schemaJson);
}
Also used : Partition(org.apache.gobblin.source.extractor.partition.Partition)

Aggregations

Partition (org.apache.gobblin.source.extractor.partition.Partition)3 Partitioner (org.apache.gobblin.source.extractor.partition.Partitioner)2 WatermarkType (org.apache.gobblin.source.extractor.watermark.WatermarkType)1 Extract (org.apache.gobblin.source.workunit.Extract)1 TableType (org.apache.gobblin.source.workunit.Extract.TableType)1 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)1 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)1