Search in sources:

Example 31 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class PurgeableHivePartitionDataset method purge.

/**
 * This method is responsible for actual purging.
 *  - It first creates a staging table partition with the same schema as of original table partition.
 *  - Staging table partition is then populated by original table left outer joined with compliance id table.
 *
 *  - Alter query will then change the partition location to the staging partition location.
 *  - In flight queries won't get affected due to alter partition query.
 *
 * @throws IOException if the proxy filesystem or query executor cannot be obtained,
 *                     or if any of the underlying Hive queries fails
 */
public void purge() throws IOException {
    this.datasetOwner = getOwner();
    // Defensive copy: proxy setup must not mutate this dataset's own state.
    State state = new State(this.state);
    this.datasetOwnerFs = ProxyUtils.getOwnerFs(state, this.datasetOwner);
    try (HiveProxyQueryExecutor queryExecutor = ProxyUtils.getQueryExecutor(state, this.datasetOwner)) {
        if (this.simulate) {
            // Fixed typo in log message ("Wont't" -> "Won't").
            log.info("Simulate is set to true. Won't run actual queries");
            return;
        }
        String originalPartitionLocation = getOriginalPartitionLocation();
        // Create the staging table and staging partition
        queryExecutor.executeQueries(HivePurgerQueryTemplate.getCreateStagingTableQuery(this), this.datasetOwner);
        // Snapshot the partition's last-modified time before and after the purge queries;
        // the commit policy below uses these to detect concurrent writes during the job run.
        this.startTime = getLastModifiedTime(originalPartitionLocation);
        // Execute purge queries, that is insert filtered data into the staging partition
        queryExecutor.executeQueries(this.purgeQueries, this.datasetOwner);
        this.endTime = getLastModifiedTime(originalPartitionLocation);
        // Create a backup table and partition pointing to the original partition location
        queryExecutor.executeQueries(HivePurgerQueryTemplate.getBackupQueries(this), this.datasetOwner);
        String commitPolicyString = this.state.getProp(ComplianceConfigurationKeys.PURGER_COMMIT_POLICY_CLASS, ComplianceConfigurationKeys.DEFAULT_PURGER_COMMIT_POLICY_CLASS);
        CommitPolicy<PurgeableHivePartitionDataset> commitPolicy = GobblinConstructorUtils.invokeConstructor(CommitPolicy.class, commitPolicyString);
        if (!commitPolicy.shouldCommit(this)) {
            log.error("Last modified time before start of execution : " + this.startTime);
            log.error("Last modified time after execution of purge queries : " + this.endTime);
            throw new RuntimeException("Failed to commit. File modified during job run.");
        }
        // Alter the original table partition to start pointing to the cleaned-partition-location/staging-partition-location
        queryExecutor.executeQueries(HivePurgerQueryTemplate.getAlterOriginalPartitionLocationQueries(this), this.datasetOwner);
        // Drop the staging table
        queryExecutor.executeQueries(HivePurgerQueryTemplate.getDropStagingTableQuery(this), this.datasetOwner);
    } catch (SQLException e) {
        // Wrap to honor the declared IOException contract while preserving the cause.
        throw new IOException(e);
    }
}
Also used : SQLException(java.sql.SQLException) State(org.apache.gobblin.configuration.State) IOException(java.io.IOException) HiveProxyQueryExecutor(org.apache.gobblin.compliance.HiveProxyQueryExecutor)

Example 32 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class ComplianceRetentionJob method initDatasetFinder.

/**
 * Initializes the {@code DatasetsFinder} configured via
 * {@code GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS} and scans all Hive datasets:
 * tables with partitions are recorded in {@code tableNamesList}; empty
 * trash/backup/staging tables are queued in {@code tablesToDrop} when the
 * drop-empty-tables property is enabled.
 *
 * @param properties job properties; must contain {@code GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS}
 * @throws IOException if the filesystem or dataset iteration fails
 */
public void initDatasetFinder(Properties properties) throws IOException {
    // Fixed typo in the precondition message ("propety" -> "property").
    Preconditions.checkArgument(properties.containsKey(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS), "Missing required property " + GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
    String finderClass = properties.getProperty(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
    this.finder = GobblinConstructorUtils.invokeConstructor(DatasetsFinder.class, finderClass, new State(properties));
    // NOTE(review): FileSystem.newInstance creates a fresh FileSystem that is never
    // closed here; confirm whether the iterator needs it beyond this loop before
    // adding a close, as closing early could break lazy iteration.
    Iterator<HiveDataset> datasetsIterator = new HiveDatasetFinder(FileSystem.newInstance(new Configuration()), properties).getDatasetsIterator();
    while (datasetsIterator.hasNext()) {
        // Drop partitions from empty tables if property is set, otherwise skip the table
        HiveDataset hiveDataset = datasetsIterator.next();
        List<Partition> partitionsFromDataset = hiveDataset.getPartitionsFromDataset();
        String completeTableName = hiveDataset.getTable().getCompleteName();
        if (!partitionsFromDataset.isEmpty()) {
            // Non-empty table: track it for retention processing and move on.
            this.tableNamesList.add(completeTableName);
            continue;
        }
        if (!Boolean.parseBoolean(properties.getProperty(ComplianceConfigurationKeys.SHOULD_DROP_EMPTY_TABLES, ComplianceConfigurationKeys.DEFAULT_SHOULD_DROP_EMPTY_TABLES))) {
            continue;
        }
        // Only purge-related auxiliary tables (trash/backup/staging) are eligible for dropping.
        if (completeTableName.contains(ComplianceConfigurationKeys.TRASH) || completeTableName.contains(ComplianceConfigurationKeys.BACKUP) || completeTableName.contains(ComplianceConfigurationKeys.STAGING)) {
            this.tablesToDrop.add(hiveDataset);
        }
    }
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) Configuration(org.apache.hadoop.conf.Configuration) State(org.apache.gobblin.configuration.State) DatasetsFinder(org.apache.gobblin.dataset.DatasetsFinder) HiveDatasetFinder(org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder) HiveDataset(org.apache.gobblin.data.management.copy.hive.HiveDataset)

Example 33 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class HivePartitionVersionRetentionCleaner method clean.

/**
 * If simulate is set to true, this will simply return.
 * If version is pointing to an empty location, drop the partition and close the jdbc connection.
 * If version is pointing to the same location as of the dataset, then drop the partition and close the jdbc connection.
 * If version is pointing to the non deletable version locations, then drop the partition and close the jdbc connection.
 * Otherwise delete the data underneath, drop the partition and close the jdbc connection.
 *
 * NOTE(review): the simulate guard sits inside the "location has content" branch only,
 * so in simulate mode the metadata-drop branches above still execute
 * executeDropVersionQueries — confirm this matches the intended simulate semantics.
 *
 * @throws IOException if filesystem access or the drop queries fail
 */
@Override
public void clean() throws IOException {
    Path versionLocation = ((HivePartitionRetentionVersion) this.datasetVersion).getLocation();
    Path datasetLocation = ((CleanableHivePartitionDataset) this.cleanableDataset).getLocation();
    String completeName = ((HivePartitionRetentionVersion) this.datasetVersion).datasetURN();
    // Defensive copy so proxy setup does not mutate this cleaner's own state.
    State state = new State(this.state);
    this.fs = ProxyUtils.getOwnerFs(state, this.versionOwner);
    // try-with-resources closes the jdbc connection held by the query executor.
    try (HiveProxyQueryExecutor queryExecutor = ProxyUtils.getQueryExecutor(state, this.versionOwner)) {
        log.info("Trying to clean version " + completeName);
        if (!this.fs.exists(versionLocation)) {
            // No data to delete; only the Hive metadata will be dropped below.
            log.info("Data versionLocation doesn't exist. Metadata will be dropped for the version  " + completeName);
        } else if (datasetLocation.toString().equalsIgnoreCase(versionLocation.toString())) {
            // Version shares the live dataset's location: deleting data would destroy the dataset.
            log.info("Dataset location is same as version location. Won't delete the data but metadata will be dropped for the version " + completeName);
        } else if (this.nonDeletableVersionLocations.contains(versionLocation.toString())) {
            log.info("This version corresponds to the non deletable version. Won't delete the data but metadata will be dropped for the version " + completeName);
        } else if (HadoopUtils.hasContent(this.fs, versionLocation)) {
            if (this.simulate) {
                // Early return: in simulate mode neither data nor metadata is touched for this branch.
                log.info("Simulate is set to true. Won't delete the partition " + completeName);
                return;
            }
            log.info("Deleting data from the version " + completeName);
            // Recursive delete of the version's data directory.
            this.fs.delete(versionLocation, true);
        }
        // Drop the Hive partition metadata for this version.
        executeDropVersionQueries(queryExecutor);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) State(org.apache.gobblin.configuration.State) HiveProxyQueryExecutor(org.apache.gobblin.compliance.HiveProxyQueryExecutor)

Example 34 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class GobblinMetrics method addCustomTagToProperties.

/**
 * Adds a {@link Tag} to a {@link Properties} under the key {@link #METRICS_STATE_CUSTOM_TAGS}.
 * Also see {@link #addCustomTagToState(State, Tag)}
 *
 * <p>
 *   The {@link Properties} passed can be used to build a {@link State}.
 *   {@link org.apache.gobblin.metrics.Tag}s under this key can later be parsed using the method {@link #getCustomTagsFromState}.
 * </p>
 *
 * @param properties {@link Properties} to add the tag to.
 * @param tag {@link Tag} to add.
 */
public static void addCustomTagToProperties(Properties properties, Tag<?> tag) {
    // Delegate to the State-based helper via a State wrapper around the properties.
    addCustomTagToState(new State(properties), tag);
}
Also used : State(org.apache.gobblin.configuration.State)

Example 35 with State

use of org.apache.gobblin.configuration.State in project incubator-gobblin by apache.

the class RestorableHivePartitionDataset method init.

/**
 * Initializes this restorable dataset from the given job {@link State}:
 * validates required properties, resolves the dataset and trash owners,
 * records a timestamp, and asks the configured restore policy for the
 * dataset to restore.
 *
 * @param state job state; must contain the restore-policy class and trash-owner properties
 */
private void init(State state) {
    // Keep a private copy so later caller-side mutations don't leak in.
    this.state = new State(state);
    Preconditions.checkArgument(this.state.contains(ComplianceConfigurationKeys.RESTORE_POLICY_CLASS), "Missing required property " + ComplianceConfigurationKeys.RESTORE_POLICY_CLASS);
    Preconditions.checkArgument(this.state.contains(ComplianceConfigurationKeys.TRASH_OWNER), "Missing required property " + ComplianceConfigurationKeys.TRASH_OWNER);
    this.datasetOwner = getOwner();
    this.trashOwner = Optional.fromNullable(this.state.getProp(ComplianceConfigurationKeys.TRASH_OWNER));
    setTimeStamp();
    String policyClassName = this.state.getProp(ComplianceConfigurationKeys.RESTORE_POLICY_CLASS);
    this.restorePolicy = GobblinConstructorUtils.invokeConstructor(HivePartitionRestorePolicy.class, policyClassName, this.state);
    try {
        this.datasetToRestore = (HivePartitionDataset) this.restorePolicy.getDatasetToRestore(this);
        log.info("Found dataset to restore with " + this.datasetToRestore.datasetURN());
    } catch (IOException e) {
        // Rethrow as unchecked; propagate always throws, the 'throw' makes that explicit.
        throw Throwables.propagate(e);
    }
    this.datasetToRestoreOwner = this.datasetToRestore.getOwner();
}
Also used : State(org.apache.gobblin.configuration.State) IOException(java.io.IOException)

Aggregations

State (org.apache.gobblin.configuration.State)195 Test (org.testng.annotations.Test)103 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)74 SourceState (org.apache.gobblin.configuration.SourceState)38 Path (org.apache.hadoop.fs.Path)30 File (java.io.File)20 IOException (java.io.IOException)16 Map (java.util.Map)14 WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState)14 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)14 TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState)13 Properties (java.util.Properties)12 FinalState (org.apache.gobblin.util.FinalState)12 Configuration (org.apache.hadoop.conf.Configuration)12 TaskLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)9 Config (com.typesafe.config.Config)8 ArrayList (java.util.ArrayList)8 GenericRecord (org.apache.avro.generic.GenericRecord)8 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)7 FileInputStream (java.io.FileInputStream)6