Search in sources :

Example 1 with HivePartitionDataset

use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.

From the class HivePurgerExtractor, the method createPurgeableHivePartitionDataset:

/**
 * Builds a {@link PurgeableHivePartitionDataset} for the partition named by
 * {@link ComplianceConfigurationKeys#PARTITION_NAME}, populated with the compliance id,
 * compliance-id table, timestamp, and the simulate / partition-format flags taken from
 * the given {@link State}.
 *
 * @param state job state carrying the required compliance properties
 * @return a fully populated purgeable dataset
 * @throws IOException if the underlying Hive partition cannot be resolved
 */
private PurgeableHivePartitionDataset createPurgeableHivePartitionDataset(State state) throws IOException {
    HivePartitionDataset hivePartitionDataset = HivePartitionFinder.findDataset(state.getProp(ComplianceConfigurationKeys.PARTITION_NAME), state);
    // Fail fast on the mandatory properties before building the dataset.
    Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.COMPLIANCEID_KEY), "Missing property " + ComplianceConfigurationKeys.COMPLIANCEID_KEY);
    Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY), "Missing property " + ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY);
    Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.TIMESTAMP), "Missing table property " + ComplianceConfigurationKeys.TIMESTAMP);
    // Primitive booleans: getPropAsBoolean returns a primitive, so boxing into Boolean
    // added an unnecessary wrapper only to unbox again in the setters below.
    boolean simulate = state.getPropAsBoolean(ComplianceConfigurationKeys.COMPLIANCE_JOB_SIMULATE, ComplianceConfigurationKeys.DEFAULT_COMPLIANCE_JOB_SIMULATE);
    String complianceIdentifier = state.getProp(ComplianceConfigurationKeys.COMPLIANCEID_KEY);
    String complianceIdTable = state.getProp(ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY);
    String timeStamp = state.getProp(ComplianceConfigurationKeys.TIMESTAMP);
    boolean specifyPartitionFormat = state.getPropAsBoolean(ComplianceConfigurationKeys.SPECIFY_PARTITION_FORMAT, ComplianceConfigurationKeys.DEFAULT_SPECIFY_PARTITION_FORMAT);
    // Give the dataset its own copy of the job properties.
    State datasetState = new State();
    datasetState.addAll(state.getProperties());
    PurgeableHivePartitionDataset dataset = new PurgeableHivePartitionDataset(hivePartitionDataset);
    dataset.setComplianceId(complianceIdentifier);
    dataset.setComplianceIdTable(complianceIdTable);
    dataset.setComplianceField(getComplianceField(state, hivePartitionDataset));
    dataset.setTimeStamp(timeStamp);
    dataset.setState(datasetState);
    dataset.setSimulate(simulate);
    dataset.setSpecifyPartitionFormat(specifyPartitionFormat);
    return dataset;
}
Also used : HivePartitionDataset(org.apache.gobblin.compliance.HivePartitionDataset) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State)

Example 2 with HivePartitionDataset

use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.

From the class HivePurgerPublisher, the method submitEvent:

/**
 * Submits a purge-tracking event named {@code name} for the partition recorded in the
 * work unit of {@code state}. The event metadata carries records/bytes read and written
 * plus the database, table and partition names. If the partition name is malformed or
 * the partition cannot be resolved from the metastore, no event is submitted.
 *
 * @param state work unit state holding the partition name and read-side counters
 * @param name  event name to submit under
 */
private void submitEvent(WorkUnitState state, String name) {
    WorkUnit workUnit = state.getWorkunit();
    Map<String, String> metadata = new HashMap<>();
    // NOTE(review): assumes NUM_ROWS is always populated on the state; a missing value
    // would make the Long.parseLong below throw NPE -- confirm upstream always sets it.
    String recordsRead = state.getProp(ComplianceConfigurationKeys.NUM_ROWS);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSREAD, recordsRead);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESREAD, getDataSize(workUnit.getProp(ComplianceConfigurationKeys.RAW_DATA_SIZE), workUnit.getProp(ComplianceConfigurationKeys.TOTAL_SIZE)));
    String partitionNameProp = workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME);
    // Partition names are expected to look like db@table@partition.
    // (lowerCamelCase: this is a local variable, not a constant.)
    Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
    List<String> namesList = atSplitter.splitToList(partitionNameProp);
    if (namesList.size() != 3) {
        log.warn("Not submitting event. Invalid partition name: " + partitionNameProp);
        return;
    }
    String dbName = namesList.get(0), tableName = namesList.get(1), partitionName = namesList.get(2);
    org.apache.hadoop.hive.metastore.api.Partition apiPartition = null;
    Partition qlPartition = null;
    try {
        Table table = new Table(this.client.getTable(dbName, tableName));
        apiPartition = this.client.getPartition(dbName, tableName, partitionName);
        qlPartition = new Partition(table, apiPartition);
    } catch (Exception e) {
        // Pass the throwable to the logger so the full stack trace lands in the
        // configured log output, instead of printStackTrace() dumping to stderr.
        log.warn("Not submitting event. Failed to resolve partition '" + partitionName + "'", e);
        return;
    }
    HivePartitionDataset hivePartitionDataset = new HivePartitionDataset(qlPartition);
    String recordsWritten = DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.NUM_ROWS, ComplianceConfigurationKeys.DEFAULT_NUM_ROWS);
    // Purged = read minus written; both counters come in as string properties.
    String recordsPurged = Long.toString((Long.parseLong(recordsRead) - Long.parseLong(recordsWritten)));
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSWRITTEN, recordsWritten);
    metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESWRITTEN, getDataSize(DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.RAW_DATA_SIZE, ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE), DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.TOTAL_SIZE, ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE)));
    metadata.put(DatasetMetrics.DATABASE_NAME, hivePartitionDataset.getDbName());
    metadata.put(DatasetMetrics.TABLE_NAME, hivePartitionDataset.getTableName());
    metadata.put(DatasetMetrics.PARTITION_NAME, hivePartitionDataset.getName());
    metadata.put(DatasetMetrics.RECORDS_PURGED, recordsPurged);
    this.eventSubmitter.submit(name, metadata);
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) Splitter(com.google.common.base.Splitter) Table(org.apache.hadoop.hive.ql.metadata.Table) HashMap(java.util.HashMap) TException(org.apache.thrift.TException) HivePartitionDataset(org.apache.gobblin.compliance.HivePartitionDataset) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 3 with HivePartitionDataset

use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.

From the class HivePurgerSource, the method createWorkUnits:

/**
 * Creates the list of all work units needed for the current execution.
 * Fresh work units are created for each purgeable partition starting from the watermark,
 * and failed work units from the previous run are added to the list first.
 *
 * @param state source state used to read/write the job watermark
 * @throws IOException if work unit creation fails
 */
protected void createWorkUnits(SourceState state) throws IOException {
    createWorkunitsFromPreviousState(state);
    if (this.datasets.isEmpty()) {
        return;
    }
    for (HivePartitionDataset dataset : this.datasets) {
        // Stop once the cap is reached, remembering where to resume next run.
        if (workUnitsExceeded()) {
            log.info("Workunits exceeded");
            setJobWatermark(state, dataset.datasetURN());
            return;
        }
        if (!this.policy.shouldPurge(dataset)) {
            continue;
        }
        WorkUnit workUnit = createNewWorkUnit(dataset);
        log.info("Created new work unit with partition " + workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME));
        this.workUnitMap.put(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), workUnit);
        this.workUnitsCreatedCount++;
    }
    // All datasets processed within the cap: record the sentinel watermark so the next
    // run knows this one completed.
    if (!state.contains(ComplianceConfigurationKeys.HIVE_PURGER_WATERMARK)) {
        this.setJobWatermark(state, ComplianceConfigurationKeys.NO_PREVIOUS_WATERMARK);
    }
}
Also used : HivePartitionDataset(org.apache.gobblin.compliance.HivePartitionDataset) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 4 with HivePartitionDataset

use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.

From the class AdhocRestorePolicy, the method getDatasetToRestore:

/**
 * Resolves the dataset configured via {@code DATASET_TO_RESTORE} and verifies that it
 * actually contains data before handing it back as the restore source.
 *
 * @param dataset to restore
 * @return dataset to restore with
 * @throws IOException if the replacement dataset cannot be located or inspected
 */
public HivePartitionDataset getDatasetToRestore(HivePartitionDataset dataset) throws IOException {
    Preconditions.checkArgument(this.state.contains(ComplianceConfigurationKeys.DATASET_TO_RESTORE), "Missing required property " + ComplianceConfigurationKeys.DATASET_TO_RESTORE);
    String restoreDatasetName = this.state.getProp(ComplianceConfigurationKeys.DATASET_TO_RESTORE);
    HivePartitionDataset restoreDataset = HivePartitionFinder.findDataset(restoreDatasetName, this.state);
    // Check through the owner's file system that the replacement is non-empty.
    FileSystem ownerFs = ProxyUtils.getOwnerFs(new State(this.state), restoreDataset.getOwner());
    Preconditions.checkArgument(HadoopUtils.hasContent(ownerFs, restoreDataset.getLocation()), "Dataset to restore doesn't have any data");
    return restoreDataset;
}
Also used : HivePartitionDataset(org.apache.gobblin.compliance.HivePartitionDataset) State(org.apache.gobblin.configuration.State) FileSystem(org.apache.hadoop.fs.FileSystem)

Example 5 with HivePartitionDataset

use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.

From the class LKGRestorePolicy, the method getDatasetToRestore:

/**
 * Finds the most recent restorable backup version of {@code dataset}.
 *
 * @param dataset to restore
 * @return most recent restorable dataset
 * @throws IOException if version discovery fails
 */
public HivePartitionDataset getDatasetToRestore(HivePartitionDataset dataset) throws IOException {
    List<String> patterns = new ArrayList<>();
    patterns.add(getCompleteTableName(dataset) + ComplianceConfigurationKeys.BACKUP);
    HivePartitionVersionFinder finder = new HivePartitionVersionFinder(WriterUtils.getWriterFs(new State(this.state)), this.state, patterns);
    List<HivePartitionVersion> versions = new ArrayList<>(finder.findDatasetVersions(dataset));
    Preconditions.checkArgument(!versions.isEmpty(), "No versions to restore dataset " + dataset.datasetURN());
    // Keep only the versions that can actually serve as a restore source.
    List<HivePartitionVersion> restorableVersions = new ArrayList<>();
    for (HivePartitionVersion candidate : versions) {
        if (isRestorable(dataset, candidate)) {
            restorableVersions.add(candidate);
        }
    }
    Preconditions.checkArgument(!restorableVersions.isEmpty(), "No versions to restore dataset " + dataset.datasetURN());
    Collections.sort(restorableVersions);
    // return the most recent restorable version
    return new HivePartitionDataset(restorableVersions.get(0));
}
Also used : HivePartitionVersion(org.apache.gobblin.compliance.HivePartitionVersion) HivePartitionDataset(org.apache.gobblin.compliance.HivePartitionDataset) State(org.apache.gobblin.configuration.State) ArrayList(java.util.ArrayList) HivePartitionVersionFinder(org.apache.gobblin.compliance.HivePartitionVersionFinder)

Aggregations

HivePartitionDataset (org.apache.gobblin.compliance.HivePartitionDataset)7 State (org.apache.gobblin.configuration.State)3 ArrayList (java.util.ArrayList)2 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)2 Splitter (com.google.common.base.Splitter)1 HashMap (java.util.HashMap)1 HivePartitionVersion (org.apache.gobblin.compliance.HivePartitionVersion)1 HivePartitionVersionFinder (org.apache.gobblin.compliance.HivePartitionVersionFinder)1 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Partition (org.apache.hadoop.hive.ql.metadata.Partition)1 Table (org.apache.hadoop.hive.ql.metadata.Table)1 TException (org.apache.thrift.TException)1