Search in sources :

Example 1 with JobCommitPolicy

use of org.apache.gobblin.source.extractor.JobCommitPolicy in project incubator-gobblin by apache.

the class Task method shouldPublishDataInTask.

/**
 * Decides whether this task publishes its own output straight to the final publisher output directory.
 *
 * <p>
 *   Task-level publishing is only considered when {@link ConfigurationKeys#PUBLISH_DATA_AT_JOB_LEVEL}
 *   is false. In that case the outcome depends on the job commit policy:
 *
 *   <ul>
 *     <li>{@link JobCommitPolicy#COMMIT_ON_PARTIAL_SUCCESS}: always publish in the task.</li>
 *     <li>{@link JobCommitPolicy#COMMIT_SUCCESSFUL_TASKS}: publish in the task only if the task state's
 *     working state is {@link WorkUnitState.WorkingState#SUCCESSFUL}.</li>
 *     <li>Any other policy: do not publish in the task; publishing is left to the job level.</li>
 *   </ul>
 * </p>
 *
 * @return {@code true} if the task should publish its data itself, {@code false} otherwise
 */
private boolean shouldPublishDataInTask() {
    final boolean jobLevelPublishing = this.taskState.getPropAsBoolean(
        ConfigurationKeys.PUBLISH_DATA_AT_JOB_LEVEL, ConfigurationKeys.DEFAULT_PUBLISH_DATA_AT_JOB_LEVEL);
    if (jobLevelPublishing) {
        LOG.info(String.format("%s is true. Will publish data at the job level.", ConfigurationKeys.PUBLISH_DATA_AT_JOB_LEVEL));
        return false;
    }

    final JobCommitPolicy commitPolicy = JobCommitPolicy.getCommitPolicy(this.taskState);

    // Partial-success commits never wait for the rest of the job.
    if (commitPolicy == JobCommitPolicy.COMMIT_ON_PARTIAL_SUCCESS) {
        return true;
    }

    // Every policy other than COMMIT_SUCCESSFUL_TASKS defers publishing to the job.
    if (commitPolicy != JobCommitPolicy.COMMIT_SUCCESSFUL_TASKS) {
        LOG.info("Will publish data at the job level with job commit policy: " + commitPolicy);
        return false;
    }

    // COMMIT_SUCCESSFUL_TASKS: publish only when this task itself succeeded.
    return WorkUnitState.WorkingState.SUCCESSFUL == this.taskState.getWorkingState();
}
Also used : JobCommitPolicy(org.apache.gobblin.source.extractor.JobCommitPolicy)

Example 2 with JobCommitPolicy

use of org.apache.gobblin.source.extractor.JobCommitPolicy in project incubator-gobblin by apache.

the class JdbcWriterInitializer method validateInput.

/**
 * Validates the JDBC writer configuration before any initialization work is done.
 *
 * <ol>
 *   <li>Every branch must define a final (publish) table, and no two branches may share one.</li>
 *   <li>No two branches may share the same non-empty staging table.</li>
 *   <li>The job commit policy and {@link ConfigurationKeys#PUBLISH_DATA_AT_JOB_LEVEL} must agree:
 *       either the policy is {@link JobCommitPolicy#COMMIT_ON_FULL_SUCCESS} and publishing happens at job
 *       level, or the policy is something else and publishing does not happen at job level. When the commit
 *       policy is not full, Gobblin writes to the final table at task level, which contradicts job-level
 *       publishing.</li>
 * </ol>
 *
 * @param state the state holding the writer/publisher configuration to validate
 * @throws NullPointerException if a branch has no final table name configured
 * @throws IllegalArgumentException if any of the rules above is violated
 */
private static void validateInput(State state) {
    final int branchCount = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);

    // Rule 1: final tables must be present and unique across branches.
    final Set<String> seenPublishTables = Sets.newHashSet();
    for (int branch = 0; branch < branchCount; branch++) {
        String finalTable = getProp(state, JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME, branchCount, branch);
        Preconditions.checkNotNull(finalTable, JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME + " should not be null.");
        // Set#add returns false when the element was already present.
        if (!seenPublishTables.add(finalTable)) {
            throw new IllegalArgumentException("Duplicate " + JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME + " is not allowed across branches");
        }
    }

    // Rule 2: staging tables, when given, must be unique across branches.
    final Set<String> seenStagingTables = Sets.newHashSet();
    for (int branch = 0; branch < branchCount; branch++) {
        String staging = getProp(state, ConfigurationKeys.WRITER_STAGING_TABLE, branchCount, branch);
        boolean firstOccurrence = seenStagingTables.add(staging);
        // Missing/empty staging tables may repeat; only real names are checked for duplicates.
        if (!firstOccurrence && !StringUtils.isEmpty(staging)) {
            throw new IllegalArgumentException("Duplicate " + ConfigurationKeys.WRITER_STAGING_TABLE + " is not allowed across branches");
        }
    }

    // Rule 3: "commit on full success" and "publish at job level" must be both on or both off.
    final JobCommitPolicy commitPolicy = JobCommitPolicy.getCommitPolicy(state);
    final boolean publishAtJobLevel = state.getPropAsBoolean(
        ConfigurationKeys.PUBLISH_DATA_AT_JOB_LEVEL, ConfigurationKeys.DEFAULT_PUBLISH_DATA_AT_JOB_LEVEL);
    final boolean commitOnFullSuccess = JobCommitPolicy.COMMIT_ON_FULL_SUCCESS.equals(commitPolicy);
    if (commitOnFullSuccess != publishAtJobLevel) {
        throw new IllegalArgumentException("Job commit policy should be only " + JobCommitPolicy.COMMIT_ON_FULL_SUCCESS
            + " when " + ConfigurationKeys.PUBLISH_DATA_AT_JOB_LEVEL + " is true."
            + " Or Job commit policy should not be " + JobCommitPolicy.COMMIT_ON_FULL_SUCCESS
            + " and " + ConfigurationKeys.PUBLISH_DATA_AT_JOB_LEVEL + " is false.");
    }
}
Also used : JobCommitPolicy(org.apache.gobblin.source.extractor.JobCommitPolicy) ToString(lombok.ToString)

Example 3 with JobCommitPolicy

use of org.apache.gobblin.source.extractor.JobCommitPolicy in project incubator-gobblin by apache.

the class JdbcWriterInitializer method initialize.

/**
 * Prepares the staging-table setup for every {@link WorkUnit} handled by this initializer's branch.
 * Per the original author's note, this needs to happen in a single-threaded environment.
 *
 * <p>A connection is opened with {@code createConnection()} and closed automatically by the
 * try-with-resources statement. Each work unit then takes exactly one of three paths. The path is the same
 * for all work units, because both the commit policy and the configured staging table are read once
 * before the loop:
 *
 * <ol>
 *   <li><b>Skip staging.</b> Taken when the job commit policy is anything other than
 *       {@link JobCommitPolicy#COMMIT_ON_FULL_SUCCESS} (the decision comes from the commit policy, not from a
 *       dedicated "skip" flag). The work unit's staging-table property is set to the final table name.
 *       On the first work unit only, the final table is truncated if
 *       {@code JdbcPublisher.JDBC_PUBLISHER_REPLACE_FINAL_TABLE} is true.</li>
 *   <li><b>User-supplied staging table.</b> Taken when the branch's
 *       {@link ConfigurationKeys#WRITER_STAGING_TABLE} is non-empty. The work unit's staging-table property
 *       is set to that table. On the first work unit only, the table is truncated if
 *       {@link ConfigurationKeys#WRITER_TRUNCATE_STAGING_TABLE} is true, then verified to be empty, and
 *       remembered in {@code userCreatedStagingTable}.</li>
 *   <li><b>Generated staging table.</b> Otherwise a staging table is created per work unit through
 *       {@code createStagingTable(conn, commands)}, stored on the work unit, and added to
 *       {@code createdStagingTables}. NOTE(review): the original documentation says the table gets a unique
 *       name and is dropped and recreated to confirm it can be dropped later; that presumably happens inside
 *       {@code createStagingTable}, which is not visible here.</li>
 * </ol>
 *
 * @throws IllegalArgumentException if a user-supplied staging table is not empty after the optional truncate
 * @throws RuntimeException wrapping any {@link SQLException} raised while talking to the database
 */
@Override
public void initialize() {
    try (Connection conn = createConnection()) {
        JdbcWriterCommands commands = createJdbcWriterCommands(conn);
        // 1. Staging is skipped whenever the commit policy is not COMMIT_ON_FULL_SUCCESS.
        JobCommitPolicy jobCommitPolicy = JobCommitPolicy.getCommitPolicy(this.state);
        boolean isSkipStaging = !JobCommitPolicy.COMMIT_ON_FULL_SUCCESS.equals(jobCommitPolicy);
        if (isSkipStaging) {
            LOG.info("Writer will write directly to destination table as JobCommitPolicy is " + jobCommitPolicy);
        }
        // Branch-scoped configuration, resolved once and reused for every work unit.
        final String publishTable = getProp(this.state, JdbcPublisher.JDBC_PUBLISHER_FINAL_TABLE_NAME, this.branches, this.branchId);
        final String stagingTableKey = ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_STAGING_TABLE, this.branches, this.branchId);
        String stagingTable = this.state.getProp(stagingTableKey);
        // Zero-based index of the current work unit; incremented at the top of each iteration.
        int i = -1;
        for (WorkUnit wu : this.workUnits) {
            i++;
            if (isSkipStaging) {
                // NOTE(review): "staing" in the log message below is a typo for "staging"; left as-is here.
                LOG.info("User chose to skip staing table on branch " + this.branchId + " workunit " + i);
                // The writer is pointed at the final table by storing it under the staging-table key.
                wu.setProp(stagingTableKey, publishTable);
                if (i == 0) {
                    // 1.1. Truncate the final table once (first work unit only) when replacement was requested.
                    if (getPropAsBoolean(this.state, JdbcPublisher.JDBC_PUBLISHER_REPLACE_FINAL_TABLE, this.branches, this.branchId)) {
                        LOG.info("User chose to replace final table " + publishTable + " on branch " + this.branchId + " workunit " + i);
                        commands.truncate(database, publishTable);
                    }
                }
                continue;
            }
            // 2. Staging is in use: prefer a staging table supplied through configuration.
            if (!StringUtils.isEmpty(stagingTable)) {
                LOG.info("Staging table for branch " + this.branchId + " from user: " + stagingTable);
                wu.setProp(stagingTableKey, stagingTable);
                if (i == 0) {
                    // 2.1. Truncate staging table, if requested (the truncate flag defaults to false).
                    if (this.state.getPropAsBoolean(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_TRUNCATE_STAGING_TABLE, this.branches, this.branchId), false)) {
                        LOG.info("Truncating staging table " + stagingTable + " as requested.");
                        commands.truncate(database, stagingTable);
                    }
                    // 2.2. The staging table must be empty before use; otherwise fail.
                    if (!commands.isEmpty(database, stagingTable)) {
                        LOG.error("Staging table " + stagingTable + " is not empty. Failing.");
                        throw new IllegalArgumentException("Staging table " + stagingTable + " should be empty.");
                    }
                    // Remember the user-supplied table (presumably for later handling elsewhere - TODO confirm).
                    this.userCreatedStagingTable = stagingTable;
                }
                continue;
            }
            // 3. No staging table was supplied and staging is not skipped: create one for this work unit.
            LOG.info("Staging table has not been passed from user for branch " + this.branchId + " workunit " + i + " . Creating.");
            String createdStagingTable = createStagingTable(conn, commands);
            // 4. Record the staging table on the work unit and keep track of every table created here.
            wu.setProp(stagingTableKey, createdStagingTable);
            this.createdStagingTables.add(createdStagingTable);
            LOG.info("Staging table " + createdStagingTable + " has been created for branchId " + this.branchId + " workunit " + i);
        }
    } catch (SQLException e) {
        // Surface database failures as unchecked, preserving the original cause.
        throw new RuntimeException("Failed with SQL", e);
    }
}
Also used : SQLException(java.sql.SQLException) JdbcWriterCommands(org.apache.gobblin.writer.commands.JdbcWriterCommands) JobCommitPolicy(org.apache.gobblin.source.extractor.JobCommitPolicy) Connection(java.sql.Connection) ToString(lombok.ToString) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 4 with JobCommitPolicy

use of org.apache.gobblin.source.extractor.JobCommitPolicy in project incubator-gobblin by apache.

the class AbstractSource method getPreviousWorkUnitStatesForRetry.

/**
 * Get a list of {@link WorkUnitState}s of previous {@link WorkUnit}s subject for retries.
 *
 * <p>
 *     Two keys configure work unit retries. {@link ConfigurationKeys#WORK_UNIT_RETRY_ENABLED_KEY}
 *     simply turns retries on or off (on by default) and is sufficient for most jobs.
 *     {@link ConfigurationKeys#WORK_UNIT_RETRY_POLICY_KEY} is more advanced and, when present, takes
 *     precedence: it names a {@link WorkUnitRetryPolicy}, which is useful as a global policy for a
 *     group of jobs that have different job commit policies and want work unit retries only for a
 *     specific job commit policy.
 * </p>
 *
 * <p>
 *     Previous work units are returned for retry only if their working state is not
 *     {@link WorkUnitState.WorkingState#COMMITTED}. With a conditional retry policy
 *     ({@code ON_COMMIT_ON_PARTIAL_SUCCESS} / {@code ON_COMMIT_ON_FULL_SUCCESS}) they are returned only
 *     when the job commit policy is the matching one. If
 *     {@link ConfigurationKeys#OVERWRITE_CONFIGS_IN_STATESTORE} is true, each returned state is a copy
 *     whose properties are overridden with those of {@code state}.
 * </p>
 *
 * @param state Source state
 * @return list of {@link WorkUnitState}s of previous {@link WorkUnit}s subject for retries; empty if
 *         there are no previous work units, retries are disabled, or the retry policy does not match
 *         the job commit policy
 */
protected List<WorkUnitState> getPreviousWorkUnitStatesForRetry(SourceState state) {
    if (Iterables.isEmpty(state.getPreviousWorkUnitStates())) {
        return ImmutableList.of();
    }

    // Determine a work unit retry policy
    WorkUnitRetryPolicy workUnitRetryPolicy;
    if (state.contains(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY)) {
        // Use the given work unit retry policy if specified
        workUnitRetryPolicy = WorkUnitRetryPolicy.forName(state.getProp(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY));
    } else {
        // Otherwise set the retry policy based on if work unit retry is enabled
        boolean retryFailedWorkUnits = state.getPropAsBoolean(ConfigurationKeys.WORK_UNIT_RETRY_ENABLED_KEY, true);
        workUnitRetryPolicy = retryFailedWorkUnits ? WorkUnitRetryPolicy.ALWAYS : WorkUnitRetryPolicy.NEVER;
    }

    if (workUnitRetryPolicy == WorkUnitRetryPolicy.NEVER) {
        return ImmutableList.of();
    }

    // Decide eligibility BEFORE collecting anything, so no work unit states are copied only to be
    // thrown away when the retry policy and the job commit policy do not match.
    if (workUnitRetryPolicy != WorkUnitRetryPolicy.ALWAYS) {
        JobCommitPolicy jobCommitPolicy = JobCommitPolicy.forName(
            state.getProp(ConfigurationKeys.JOB_COMMIT_POLICY_KEY, ConfigurationKeys.DEFAULT_JOB_COMMIT_POLICY));
        boolean retryOnPartial = workUnitRetryPolicy == WorkUnitRetryPolicy.ON_COMMIT_ON_PARTIAL_SUCCESS
            && jobCommitPolicy == JobCommitPolicy.COMMIT_ON_PARTIAL_SUCCESS;
        boolean retryOnFull = workUnitRetryPolicy == WorkUnitRetryPolicy.ON_COMMIT_ON_FULL_SUCCESS
            && jobCommitPolicy == JobCommitPolicy.COMMIT_ON_FULL_SUCCESS;
        if (!retryOnPartial && !retryOnFull) {
            // Return an empty list if job commit policy and work unit retry policy do not match
            return ImmutableList.of();
        }
    }

    // Loop-invariant: read the overwrite flag once instead of once per work unit.
    boolean overwriteConfigs = state.getPropAsBoolean(ConfigurationKeys.OVERWRITE_CONFIGS_IN_STATESTORE,
        ConfigurationKeys.DEFAULT_OVERWRITE_CONFIGS_IN_STATESTORE);

    List<WorkUnitState> previousWorkUnitStates = Lists.newArrayList();
    // Get previous work units that were not successfully committed (subject for retries)
    for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
        if (workUnitState.getWorkingState() == WorkUnitState.WorkingState.COMMITTED) {
            continue;
        }
        if (overwriteConfigs) {
            // We need to make a copy here since getPreviousWorkUnitStates returns ImmutableWorkUnitStates
            // for which addAll is not supported
            WorkUnitState workUnitStateCopy = new WorkUnitState(workUnitState.getWorkunit(), state);
            workUnitStateCopy.addAll(workUnitState);
            workUnitStateCopy.overrideWith(state);
            previousWorkUnitStates.add(workUnitStateCopy);
        } else {
            previousWorkUnitStates.add(workUnitState);
        }
    }
    return previousWorkUnitStates;
}
Also used : WorkUnitRetryPolicy(org.apache.gobblin.source.extractor.WorkUnitRetryPolicy) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) JobCommitPolicy(org.apache.gobblin.source.extractor.JobCommitPolicy)

Aggregations

JobCommitPolicy (org.apache.gobblin.source.extractor.JobCommitPolicy): 4, ToString (lombok.ToString): 2, Connection (java.sql.Connection): 1, SQLException (java.sql.SQLException): 1, WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 1, WorkUnitRetryPolicy (org.apache.gobblin.source.extractor.WorkUnitRetryPolicy): 1, WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 1, JdbcWriterCommands (org.apache.gobblin.writer.commands.JdbcWriterCommands): 1