Use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.
From the class HivePurgerExtractor, the method createPurgeableHivePartitionDataset:
private PurgeableHivePartitionDataset createPurgeableHivePartitionDataset(State state) throws IOException {
  HivePartitionDataset hivePartitionDataset =
      HivePartitionFinder.findDataset(state.getProp(ComplianceConfigurationKeys.PARTITION_NAME), state);
  Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.COMPLIANCEID_KEY),
      "Missing property " + ComplianceConfigurationKeys.COMPLIANCEID_KEY);
  Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY),
      "Missing property " + ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY);
  Preconditions.checkArgument(state.contains(ComplianceConfigurationKeys.TIMESTAMP),
      "Missing property " + ComplianceConfigurationKeys.TIMESTAMP);
  Boolean simulate = state.getPropAsBoolean(ComplianceConfigurationKeys.COMPLIANCE_JOB_SIMULATE,
      ComplianceConfigurationKeys.DEFAULT_COMPLIANCE_JOB_SIMULATE);
  String complianceIdentifier = state.getProp(ComplianceConfigurationKeys.COMPLIANCEID_KEY);
  String complianceIdTable = state.getProp(ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY);
  String timeStamp = state.getProp(ComplianceConfigurationKeys.TIMESTAMP);
  Boolean specifyPartitionFormat = state.getPropAsBoolean(ComplianceConfigurationKeys.SPECIFY_PARTITION_FORMAT,
      ComplianceConfigurationKeys.DEFAULT_SPECIFY_PARTITION_FORMAT);
  // Snapshot the job state so the dataset carries its own copy of the properties.
  State datasetState = new State();
  datasetState.addAll(state.getProperties());
  PurgeableHivePartitionDataset dataset = new PurgeableHivePartitionDataset(hivePartitionDataset);
  dataset.setComplianceId(complianceIdentifier);
  dataset.setComplianceIdTable(complianceIdTable);
  dataset.setComplianceField(getComplianceField(state, hivePartitionDataset));
  dataset.setTimeStamp(timeStamp);
  dataset.setState(datasetState);
  dataset.setSimulate(simulate);
  dataset.setSpecifyPartitionFormat(specifyPartitionFormat);
  return dataset;
}
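A minimal caller sketch may help show which properties the method expects to find in the state. This is an illustration, not the project's own setup code: the property values below are invented, and the assumption that PARTITION_NAME follows the db@table@partition form is inferred from how HivePurgerPublisher splits it later in this page.

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.compliance.ComplianceConfigurationKeys;

// Hedged sketch: populate the minimum properties checked above before calling.
// All literal values are illustrative only.
State jobState = new State();
jobState.setProp(ComplianceConfigurationKeys.PARTITION_NAME, "tracking@PageViewEvent@datepartition=2016-01-01");
jobState.setProp(ComplianceConfigurationKeys.COMPLIANCEID_KEY, "memberId");
jobState.setProp(ComplianceConfigurationKeys.COMPLIANCE_ID_TABLE_KEY, "compliance.member_ids");
jobState.setProp(ComplianceConfigurationKeys.TIMESTAMP, "1464035400");
PurgeableHivePartitionDataset purgeable = createPurgeableHivePartitionDataset(jobState);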
Use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.
From the class HivePurgerPublisher, the method submitEvent:
private void submitEvent(WorkUnitState state, String name) {
  WorkUnit workUnit = state.getWorkunit();
  Map<String, String> metadata = new HashMap<>();
  String recordsRead = state.getProp(ComplianceConfigurationKeys.NUM_ROWS);
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSREAD, recordsRead);
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESREAD,
      getDataSize(workUnit.getProp(ComplianceConfigurationKeys.RAW_DATA_SIZE),
          workUnit.getProp(ComplianceConfigurationKeys.TOTAL_SIZE)));
  // The partition name property is expected in the form db@table@partition.
  String partitionNameProp = workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME);
  Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
  List<String> namesList = atSplitter.splitToList(partitionNameProp);
  if (namesList.size() != 3) {
    log.warn("Not submitting event. Invalid partition name: " + partitionNameProp);
    return;
  }
  String dbName = namesList.get(0);
  String tableName = namesList.get(1);
  String partitionName = namesList.get(2);
  org.apache.hadoop.hive.metastore.api.Partition apiPartition = null;
  Partition qlPartition = null;
  try {
    Table table = new Table(this.client.getTable(dbName, tableName));
    apiPartition = this.client.getPartition(dbName, tableName, partitionName);
    qlPartition = new Partition(table, apiPartition);
  } catch (Exception e) {
    log.warn("Not submitting event. Failed to resolve partition '" + partitionName + "'", e);
    return;
  }
  HivePartitionDataset hivePartitionDataset = new HivePartitionDataset(qlPartition);
  String recordsWritten = DatasetUtils.getProperty(hivePartitionDataset,
      ComplianceConfigurationKeys.NUM_ROWS, ComplianceConfigurationKeys.DEFAULT_NUM_ROWS);
  // Records purged = rows read from the original partition minus rows written back after the purge.
  String recordsPurged = Long.toString(Long.parseLong(recordsRead) - Long.parseLong(recordsWritten));
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_RECORDSWRITTEN, recordsWritten);
  metadata.put(ComplianceConfigurationKeys.WORKUNIT_BYTESWRITTEN,
      getDataSize(DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.RAW_DATA_SIZE,
              ComplianceConfigurationKeys.DEFAULT_RAW_DATA_SIZE),
          DatasetUtils.getProperty(hivePartitionDataset, ComplianceConfigurationKeys.TOTAL_SIZE,
              ComplianceConfigurationKeys.DEFAULT_TOTAL_SIZE)));
  metadata.put(DatasetMetrics.DATABASE_NAME, hivePartitionDataset.getDbName());
  metadata.put(DatasetMetrics.TABLE_NAME, hivePartitionDataset.getTableName());
  metadata.put(DatasetMetrics.PARTITION_NAME, hivePartitionDataset.getName());
  metadata.put(DatasetMetrics.RECORDS_PURGED, recordsPurged);
  this.eventSubmitter.submit(name, metadata);
}
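Since the event is dropped unless the partition name splits into exactly three parts, a quick standalone illustration of the expected format; the partition value here is hypothetical:

import com.google.common.base.Splitter;
import java.util.List;

// Guava's Splitter on "@" yields exactly [db, table, partition] for a valid name.
Splitter atSplitter = Splitter.on("@").omitEmptyStrings().trimResults();
List<String> parts = atSplitter.splitToList("tracking@PageViewEvent@datepartition=2016-01-01-00");
// parts -> [tracking, PageViewEvent, datepartition=2016-01-01-00]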
Use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.
From the class HivePurgerSource, the method createWorkUnits:
/**
 * This method creates the list of all work units needed for the current execution.
 * Fresh work units are created for each partition starting from the watermark, and failed
 * work units from the previous run are added to the list.
 */
protected void createWorkUnits(SourceState state) throws IOException {
  createWorkunitsFromPreviousState(state);
  if (this.datasets.isEmpty()) {
    return;
  }
  for (HivePartitionDataset dataset : this.datasets) {
    if (workUnitsExceeded()) {
      // Stop creating work units and record where to resume on the next run.
      log.info("Work unit limit exceeded; setting job watermark to " + dataset.datasetURN());
      setJobWatermark(state, dataset.datasetURN());
      return;
    }
    if (!this.policy.shouldPurge(dataset)) {
      continue;
    }
    WorkUnit workUnit = createNewWorkUnit(dataset);
    log.info("Created new work unit with partition " + workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME));
    this.workUnitMap.put(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), workUnit);
    this.workUnitsCreatedCount++;
  }
  if (!state.contains(ComplianceConfigurationKeys.HIVE_PURGER_WATERMARK)) {
    this.setJobWatermark(state, ComplianceConfigurationKeys.NO_PREVIOUS_WATERMARK);
  }
}
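A hedged sketch of the cap check referenced above, since it is not shown on this page. The maxWorkUnits field is an assumption for illustration; the real source may bound the number of work units differently:

// Hypothetical helper, not the actual Gobblin implementation: returns true once
// this run has created its quota of work units. maxWorkUnits is an assumed field,
// e.g. loaded from a ComplianceConfigurationKeys property at construction time.
private boolean workUnitsExceeded() {
  return this.workUnitsCreatedCount >= this.maxWorkUnits;
}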
Use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.
From the class AdhocRestorePolicy, the method getDatasetToRestore:
/**
 * @param dataset the dataset to restore
 * @return the dataset to restore it with
 */
public HivePartitionDataset getDatasetToRestore(HivePartitionDataset dataset) throws IOException {
  Preconditions.checkArgument(this.state.contains(ComplianceConfigurationKeys.DATASET_TO_RESTORE),
      "Missing required property " + ComplianceConfigurationKeys.DATASET_TO_RESTORE);
  HivePartitionDataset hivePartitionDataset =
      HivePartitionFinder.findDataset(this.state.getProp(ComplianceConfigurationKeys.DATASET_TO_RESTORE), this.state);
  // Verify, using a filesystem proxied as the partition owner, that the replacement dataset has data.
  FileSystem fs = ProxyUtils.getOwnerFs(new State(this.state), hivePartitionDataset.getOwner());
  Preconditions.checkArgument(HadoopUtils.hasContent(fs, hivePartitionDataset.getLocation()),
      "Dataset to restore doesn't have any data");
  return hivePartitionDataset;
}
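A minimal caller sketch under stated assumptions: the constructor signature, the property value, and the corruptedDataset variable are all hypothetical; only the DATASET_TO_RESTORE requirement comes from the code above.

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.compliance.ComplianceConfigurationKeys;

// Hedged sketch: the policy reads the replacement dataset's name from job state.
State jobState = new State();
jobState.setProp(ComplianceConfigurationKeys.DATASET_TO_RESTORE, "tracking@PageViewEvent@datepartition=2016-01-01");
AdhocRestorePolicy policy = new AdhocRestorePolicy(jobState); // constructor signature assumed
HivePartitionDataset restoreWith = policy.getDatasetToRestore(corruptedDataset);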
Use of org.apache.gobblin.compliance.HivePartitionDataset in project incubator-gobblin by apache.
From the class LKGRestorePolicy, the method getDatasetToRestore:
/**
 * @param dataset the dataset to restore
 * @return the most recent restorable dataset
 */
public HivePartitionDataset getDatasetToRestore(HivePartitionDataset dataset) throws IOException {
  // Look for backup versions of this dataset's table.
  List<String> patterns = new ArrayList<>();
  patterns.add(getCompleteTableName(dataset) + ComplianceConfigurationKeys.BACKUP);
  HivePartitionVersionFinder finder =
      new HivePartitionVersionFinder(WriterUtils.getWriterFs(new State(this.state)), this.state, patterns);
  List<HivePartitionVersion> versions = new ArrayList<>(finder.findDatasetVersions(dataset));
  Preconditions.checkArgument(!versions.isEmpty(), "No versions to restore dataset " + dataset.datasetURN());
  // Filter out versions that cannot be used to restore this dataset.
  List<HivePartitionVersion> nonRestorableVersions = new ArrayList<>();
  for (HivePartitionVersion version : versions) {
    if (!isRestorable(dataset, version)) {
      nonRestorableVersions.add(version);
    }
  }
  versions.removeAll(nonRestorableVersions);
  Preconditions.checkArgument(!versions.isEmpty(), "No restorable versions for dataset " + dataset.datasetURN());
  Collections.sort(versions);
  // Return the most recent restorable version.
  return new HivePartitionDataset(versions.get(0));
}
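Collections.sort followed by versions.get(0) only yields the most recent version if HivePartitionVersion's natural ordering puts newer versions first. A hedged sketch of such an ordering, with getTimeStamp() as an assumed accessor, not the actual Gobblin implementation:

// Hypothetical compareTo: descending order by timestamp, so that index 0
// is the newest version after sorting. getTimeStamp() is an assumed accessor.
@Override
public int compareTo(HivePartitionVersion other) {
  return Long.compare(other.getTimeStamp(), this.getTimeStamp());
}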