
Example 6 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class InstrumentedForkOperatorTest method testBase.

public void testBase(InstrumentedForkOperatorBase<String, String> fork) throws Exception {
    WorkUnitState state = new WorkUnitState();
    state.setProp(ConfigurationKeys.METRICS_ENABLED_KEY, Boolean.toString(true));
    fork.init(state);
    fork.forkDataRecord(new WorkUnitState(), "in");
    Map<String, Long> metrics = MetricsHelper.dumpMetrics(fork.getMetricContext());
    Assert.assertEquals(metrics.get(MetricNames.ForkOperatorMetrics.RECORDS_IN_METER), Long.valueOf(1));
    Assert.assertEquals(metrics.get(MetricNames.ForkOperatorMetrics.FORKS_OUT_METER), Long.valueOf(2));
    Assert.assertEquals(metrics.get(MetricNames.ForkOperatorMetrics.FORK_TIMER), Long.valueOf(1));
    Assert.assertEquals(MetricsHelper.dumpTags(fork.getMetricContext()).get("construct"), Constructs.FORK_OPERATOR.toString());
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState)
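
The detail worth noting is that instrumentation is toggled purely through the WorkUnitState property bag before init() is called. A minimal standalone sketch of that mechanism (hypothetical test code, not from the Gobblin suite; both property methods are confirmed by the excerpts on this page):

public void metricsToggleSketch() {
    // WorkUnitState inherits the property API from State, so enabling metrics
    // only requires setting the right key before the operator is initialized.
    WorkUnitState state = new WorkUnitState();
    state.setProp(ConfigurationKeys.METRICS_ENABLED_KEY, Boolean.toString(true));
    Assert.assertTrue(state.getPropAsBoolean(ConfigurationKeys.METRICS_ENABLED_KEY));
}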

Example 7 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class InstrumentedRowLevelPolicyTest method test.

@Test
public void test() {
    WorkUnitState state = new WorkUnitState();
    state.setProp(ConfigurationKeys.METRICS_ENABLED_KEY, Boolean.toString(true));
    TestInstrumentedRowLevelPolicy policy = new TestInstrumentedRowLevelPolicy(state, null);
    testBase(policy);
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Test(org.testng.annotations.Test)

Example 8 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class SequentialTestSource method getExtractor.

@Override
public Extractor<String, Object> getExtractor(WorkUnitState state) throws IOException {
    Config config = ConfigFactory.parseProperties(state.getProperties());
    configureIfNeeded(config);
    final LongWatermark lowWatermark = state.getWorkunit().getLowWatermark(LongWatermark.class);
    final WorkUnitState workUnitState = state;
    final int index = state.getPropAsInt(WORK_UNIT_INDEX);
    final TestBatchExtractor extractor = new TestBatchExtractor(index, lowWatermark, numRecordsPerExtract, sleepTimePerRecord, workUnitState);
    if (!streaming) {
        return extractor;
    } else {
        return (Extractor) new TestStreamingExtractor(extractor);
    }
}
Also used : Config(com.typesafe.config.Config) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) StreamingExtractor(org.apache.gobblin.source.extractor.StreamingExtractor) Extractor(org.apache.gobblin.source.extractor.Extractor) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)
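
The final branch either returns the batch extractor directly or wraps it for streaming, but both paths start from the low watermark carried by the WorkUnit. A minimal sketch of that watermark plumbing (illustrative, not taken from the test source; WatermarkInterval and WorkUnit.createEmpty() are standard Gobblin APIs, and the getLowWatermark call matches the one used above):

// A WorkUnit carries a serialized watermark interval; getLowWatermark()
// deserializes it back into the requested watermark type.
WorkUnit workUnit = WorkUnit.createEmpty();
workUnit.setWatermarkInterval(new WatermarkInterval(new LongWatermark(0L), new LongWatermark(100L)));
WorkUnitState state = new WorkUnitState(workUnit);
LongWatermark low = state.getWorkunit().getLowWatermark(LongWatermark.class); // low.getValue() == 0L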

Example 9 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class QueryBasedSource method getPreviousWatermarksForAllTables.

/**
 * For each table, if job commit policy is to commit on full success, and the table has failed tasks in the
 * previous run, return the lowest low watermark among all previous {@code WorkUnitState}s of the table.
 * Otherwise, return the highest high watermark among all previous {@code WorkUnitState}s of the table.
 */
static Map<SourceEntity, Long> getPreviousWatermarksForAllTables(SourceState state) {
    Map<SourceEntity, Long> result = Maps.newHashMap();
    Map<SourceEntity, Long> prevLowWatermarksByTable = Maps.newHashMap();
    Map<SourceEntity, Long> prevActualHighWatermarksByTable = Maps.newHashMap();
    Set<SourceEntity> tablesWithFailedTasks = Sets.newHashSet();
    Set<SourceEntity> tablesWithNoUpdatesOnPreviousRun = Sets.newHashSet();
    boolean commitOnFullSuccess = JobCommitPolicy.getCommitPolicy(state) == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS;
    for (WorkUnitState previousWus : state.getPreviousWorkUnitStates()) {
        Optional<SourceEntity> sourceEntity = SourceEntity.fromState(previousWus);
        if (!sourceEntity.isPresent()) {
            log.warn("Missing source entity for WorkUnit state: " + previousWus);
            continue;
        }
        SourceEntity table = sourceEntity.get();
        long lowWm = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
        LongWatermark waterMarkObj = previousWus.getWorkunit().getLowWatermark(LongWatermark.class);
        // "watermark.interval.value": "{\"low.watermark.to.json\":{\"value\":20160101000000},\"expected.watermark.to.json\":{\"value\":20160715230234}}",
        if (waterMarkObj != null) {
            lowWm = waterMarkObj.getValue();
        } else if (previousWus.getProperties().containsKey(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY)) {
            // legacy property format, e.g. "workunit.state.runtime.high.water.mark": "20160716140338",
            lowWm = Long.parseLong(previousWus.getProperties().getProperty(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY));
            log.warn("Cannot find low watermark in JSON format; falling back to " + ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY + " = " + lowWm);
        }
        if (!prevLowWatermarksByTable.containsKey(table)) {
            prevLowWatermarksByTable.put(table, lowWm);
        } else {
            prevLowWatermarksByTable.put(table, Math.min(prevLowWatermarksByTable.get(table), lowWm));
        }
        long highWm = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
        waterMarkObj = previousWus.getActualHighWatermark(LongWatermark.class);
        if (waterMarkObj != null) {
            highWm = waterMarkObj.getValue();
        } else if (previousWus.getProperties().containsKey(ConfigurationKeys.WORK_UNIT_STATE_RUNTIME_HIGH_WATER_MARK)) {
            highWm = Long.parseLong(previousWus.getProperties().getProperty(ConfigurationKeys.WORK_UNIT_STATE_RUNTIME_HIGH_WATER_MARK));
            log.warn("can not find high water mark in json format, getting value from " + ConfigurationKeys.WORK_UNIT_STATE_RUNTIME_HIGH_WATER_MARK + " high water mark " + highWm);
        }
        if (!prevActualHighWatermarksByTable.containsKey(table)) {
            prevActualHighWatermarksByTable.put(table, highWm);
        } else {
            prevActualHighWatermarksByTable.put(table, Math.max(prevActualHighWatermarksByTable.get(table), highWm));
        }
        if (commitOnFullSuccess && !isSuccessfulOrCommited(previousWus)) {
            tablesWithFailedTasks.add(table);
        }
        if (!isAnyDataProcessed(previousWus)) {
            tablesWithNoUpdatesOnPreviousRun.add(table);
        }
    }
    for (Map.Entry<SourceEntity, Long> entry : prevLowWatermarksByTable.entrySet()) {
        if (tablesWithFailedTasks.contains(entry.getKey())) {
            log.info("Resetting low watermark to {} because previous run failed.", entry.getValue());
            result.put(entry.getKey(), entry.getValue());
        } else if (tablesWithNoUpdatesOnPreviousRun.contains(entry.getKey()) && state.getPropAsBoolean(ConfigurationKeys.SOURCE_QUERYBASED_RESET_EMPTY_PARTITION_WATERMARK, ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_RESET_EMPTY_PARTITION_WATERMARK)) {
            log.info("Resetting low watermakr to {} because previous run processed no data.", entry.getValue());
            result.put(entry.getKey(), entry.getValue());
        } else {
            result.put(entry.getKey(), prevActualHighWatermarksByTable.get(entry.getKey()));
        }
    }
    return result;
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) HashMap(java.util.HashMap) Map(java.util.Map)
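
To make the min/max folding concrete, here is a small worked example (numbers invented). Suppose one table has three previous WorkUnitStates with low watermarks {10, 5, 20} and actual high watermarks {15, 8, 25}:

// Commit-on-full-success with a failed task: restart from the lowest low watermark.
long low = Math.min(Math.min(10, 5), 20);   // 5  -> re-pull everything since 5
// Otherwise: resume from the highest actual high watermark.
long high = Math.max(Math.max(15, 8), 25);  // 25 -> continue after 25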

Example 10 with WorkUnitState

use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

the class FileBasedSource method getWorkunits.

/**
 * This method takes the snapshot seen in the previous run and compares it to the list
 * of files currently in the source. It then decides which files need to be pulled
 * and distributes those files across the workunits. The comparison matches the
 * names of the files currently in the source against the names retrieved from the
 * previous state.
 * @param state is the source state
 * @return a list of workunits for the framework to run
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    initLogger(state);
    try {
        initFileSystemHelper(state);
    } catch (FileBasedHelperException e) {
        Throwables.propagate(e);
    }
    log.info("Getting work units");
    String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    String entityName = state.getProp(ConfigurationKeys.SOURCE_ENTITY);
    // Override extract table name
    String extractTableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
    // If extract table name is not found then consider entity name as extract table name
    if (Strings.isNullOrEmpty(extractTableName)) {
        extractTableName = entityName;
    }
    TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
    List<WorkUnitState> previousWorkunits = Lists.newArrayList(state.getPreviousWorkUnitStates());
    Set<String> prevFsSnapshot = Sets.newHashSet();
    // Get list of files seen in the previous run
    if (!previousWorkunits.isEmpty()) {
        if (previousWorkunits.get(0).getWorkunit().contains(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT)) {
            prevFsSnapshot.addAll(previousWorkunits.get(0).getWorkunit().getPropAsSet(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT));
        } else if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED, ConfigurationKeys.DEFAULT_SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED)) {
            // A prior snapshot is required but missing; fail fast so we don't accidentally read files that have already been processed.
            throw new RuntimeException(String.format("No '%s' found on state of prior job", ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT));
        }
    }
    List<WorkUnit> workUnits = Lists.newArrayList();
    List<WorkUnit> previousWorkUnitsForRetry = this.getPreviousWorkUnitsForRetry(state);
    log.info("Total number of work units from the previous failed runs: " + previousWorkUnitsForRetry.size());
    for (WorkUnit previousWorkUnitForRetry : previousWorkUnitsForRetry) {
        prevFsSnapshot.addAll(previousWorkUnitForRetry.getPropAsSet(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
        workUnits.add(previousWorkUnitForRetry);
    }
    // Get list of files that need to be pulled
    List<String> currentFsSnapshot = this.getcurrentFsSnapshot(state);
    // The snapshot we want to save. This might not be the full snapshot if we don't pull all files.
    List<String> effectiveSnapshot = Lists.newArrayList();
    List<String> filesToPull = Lists.newArrayList();
    int maxFilesToPull = state.getPropAsInt(ConfigurationKeys.SOURCE_FILEBASED_MAX_FILES_PER_RUN, Integer.MAX_VALUE);
    int filesSelectedForPull = 0;
    if (currentFsSnapshot.size() > maxFilesToPull) {
        // If we might not pull all files, sort them lexicographically so there is a deterministic order in which they are ingested.
        // Note that currentFsSnapshot.size() > maxFilesToPull does not imply some will be skipped, as we still have to diff
        // against the previous snapshot; this is just a quick check of whether sorting is worth doing at all.
        Collections.sort(currentFsSnapshot);
    }
    for (String file : currentFsSnapshot) {
        if (prevFsSnapshot.contains(file)) {
            effectiveSnapshot.add(file);
        } else if ((filesSelectedForPull++) < maxFilesToPull) {
            filesToPull.add(file.split(this.splitPattern)[0]);
            effectiveSnapshot.add(file);
        } else {
            // file is not pulled this run
        }
    }
    // Refresh the snapshot on the work units being retried as well; otherwise a corrupt file could cause re-processing of already processed files
    for (WorkUnit workUnit : previousWorkUnitsForRetry) {
        workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT, StringUtils.join(effectiveSnapshot, ","));
    }
    if (!filesToPull.isEmpty()) {
        logFilesToPull(filesToPull);
        int numPartitions = state.contains(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS)
                && state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) <= filesToPull.size()
                        ? state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS)
                        : filesToPull.size();
        if (numPartitions <= 0) {
            throw new IllegalArgumentException("The number of partitions should be positive");
        }
        int filesPerPartition = filesToPull.size() % numPartitions == 0
                ? filesToPull.size() / numPartitions
                : filesToPull.size() / numPartitions + 1;
        // Distribute the files across the workunits
        for (int fileOffset = 0; fileOffset < filesToPull.size(); fileOffset += filesPerPartition) {
            // Use extract table name to create extract
            Extract extract = new Extract(tableType, nameSpaceName, extractTableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            // Eventually these setters should be integrated with framework support for generalized watermark handling
            workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT, StringUtils.join(effectiveSnapshot, ","));
            List<String> partitionFilesToPull = filesToPull.subList(fileOffset,
                    Math.min(fileOffset + filesPerPartition, filesToPull.size()));
            workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, StringUtils.join(partitionFilesToPull, ","));
            if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, false)) {
                if (partitionFilesToPull.size() != 1) {
                    throw new RuntimeException("Cannot preserve the file name if a workunit is given multiple files");
                }
                workUnit.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
            }
            workUnits.add(workUnit);
        }
        log.info("Total number of work units for the current run: " + (workUnits.size() - previousWorkUnitsForRetry.size()));
    }
    return workUnits;
}
Also used : TableType(org.apache.gobblin.source.workunit.Extract.TableType) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
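
The partitioning step above is a ceiling division followed by a clamped subList. A standalone sketch of the same arithmetic with invented values (10 files, at most 4 partitions):

int filesCount = 10, numPartitions = 4;
// Ceiling division: 10 files over 4 partitions rounds up to 3 files per partition.
int filesPerPartition = filesCount % numPartitions == 0
        ? filesCount / numPartitions
        : filesCount / numPartitions + 1;
for (int fileOffset = 0; fileOffset < filesCount; fileOffset += filesPerPartition) {
    int end = Math.min(fileOffset + filesPerPartition, filesCount);
    // Work units receive the file ranges [0,3), [3,6), [6,9), [9,10).
}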

Aggregations

WorkUnitState (org.apache.gobblin.configuration.WorkUnitState) 222
Test (org.testng.annotations.Test) 143
State (org.apache.gobblin.configuration.State) 48
SourceState (org.apache.gobblin.configuration.SourceState) 39
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit) 39
Schema (org.apache.avro.Schema) 29
Path (org.apache.hadoop.fs.Path) 26
GenericRecord (org.apache.avro.generic.GenericRecord) 19
JsonObject (com.google.gson.JsonObject) 17
ArrayList (java.util.ArrayList) 16
File (java.io.File) 14
TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState) 12
List (java.util.List) 11
Configuration (org.apache.hadoop.conf.Configuration) 11
IOException (java.io.IOException) 10
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark) 10
Extract (org.apache.gobblin.source.workunit.Extract) 10
FileSystem (org.apache.hadoop.fs.FileSystem) 10
Closer (com.google.common.io.Closer) 8
JsonParser (com.google.gson.JsonParser) 8