Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
The class HiveSource, method getWorkunits:
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    this.beginGetWorkunitsTime = System.currentTimeMillis();
    initialize(state);
    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
    while (iterator.hasNext()) {
      HiveDataset hiveDataset = iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        log.debug(String.format("Processing dataset: %s", hiveDataset));
        // Create workunits for partitions
        if (HiveUtils.isPartitioned(hiveDataset.getTable())
            && state.getPropAsBoolean(HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS, DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
          createWorkunitsForPartitionedTable(hiveDataset, client);
        } else {
          createWorkunitForNonPartitionedTable(hiveDataset);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  int realWorkunits = this.workunits.size();
  this.watermarker.onGetWorkunitsEnd(this.workunits);
  log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits, (this.workunits.size() - realWorkunits)));
  return this.workunits;
}
Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
The class SimpleHiveDatasetTieringPrioritizerTest, method getRequestor:
private CopyableDatasetRequestor getRequestor(String dbName, String tableName) {
  // Mock a requestor whose dataset exposes only a db and table name, backed by a real Hive Table.
  CopyableDatasetRequestor requestor = Mockito.mock(CopyableDatasetRequestor.class);
  HiveDataset dataset = Mockito.mock(HiveDataset.class);
  Table table = new Table(new org.apache.hadoop.hive.metastore.api.Table());
  table.setDbName(dbName);
  table.setTableName(tableName);
  Mockito.when(dataset.getTable()).thenReturn(table);
  Mockito.when(requestor.getDataset()).thenReturn(dataset);
  return requestor;
}
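A short usage sketch: the mocked chain behaves like a real dataset for the only fields a tiering prioritizer reads, which is easy to verify before feeding requestors into the comparator under test (the assertions below are illustrative, not part of the original test).

CopyableDatasetRequestor requestor = getRequestor("testdb", "testtable");
// The mock exposes exactly the db and table name set on the backing Hive Table.
Assert.assertEquals(requestor.getDataset().getTable().getDbName(), "testdb");
Assert.assertEquals(requestor.getDataset().getTable().getTableName(), "testtable");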
Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
The class HiveMaterializerSource, method getWorkunits:
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    FileSystem fs = HadoopUtils.getSourceFileSystem(state);
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());
    if (state.contains(COPY_TABLE_KEY)) {
      // Copy an existing table as-is into the staging destination.
      HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
      WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset,
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_VIEW)) {
      // Materialize a view into a physical table.
      HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
      WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_QUERY)) {
      // Materialize the result set of an arbitrary query.
      String query = state.getProp(MATERIALIZE_QUERY);
      WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  throw new RuntimeException(String.format("Must specify either %s, %s, or %s.", COPY_TABLE_KEY, MATERIALIZE_QUERY, MATERIALIZE_VIEW));
}
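Exactly one of the three trigger properties selects a branch; if none is present the method falls through to the trailing RuntimeException. A hedged configuration sketch for the query-materialization branch follows, assuming the key constants are public on HiveMaterializerSource and that the destination-table settings consumed by StageableTableMetadata live under the HIVE_MATERIALIZER_SOURCE_PREFIX namespace; the individual destination key names below are placeholders, not the real schema.

SourceState state = new SourceState();
// Trigger the query-materialization branch.
state.setProp(HiveMaterializerSource.MATERIALIZE_QUERY, "SELECT id, name FROM mydb.users WHERE active = true");
// Destination metadata read by StageableTableMetadata; key names here are illustrative.
String prefix = HiveMaterializerSource.HIVE_MATERIALIZER_SOURCE_PREFIX;
state.setProp(prefix + ".destination.table", "mydb.active_users");
state.setProp(prefix + ".destination.location", "/data/materialized/active_users");

List<WorkUnit> workUnits = new HiveMaterializerSource().getWorkunits(state);
// Expect exactly one work unit, with the Hive watermarker disabled.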
Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
The class ComplianceRetentionJob, method initDatasetFinder:
public void initDatasetFinder(Properties properties) throws IOException {
  Preconditions.checkArgument(properties.containsKey(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS),
      "Missing required property " + GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
  String finderClass = properties.getProperty(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
  this.finder = GobblinConstructorUtils.invokeConstructor(DatasetsFinder.class, finderClass, new State(properties));
  Iterator<HiveDataset> datasetsIterator =
      new HiveDatasetFinder(FileSystem.newInstance(new Configuration()), properties).getDatasetsIterator();
  while (datasetsIterator.hasNext()) {
    // Drop partitions from empty tables if property is set, otherwise skip the table
    HiveDataset hiveDataset = datasetsIterator.next();
    List<Partition> partitionsFromDataset = hiveDataset.getPartitionsFromDataset();
    String completeTableName = hiveDataset.getTable().getCompleteName();
    if (!partitionsFromDataset.isEmpty()) {
      this.tableNamesList.add(completeTableName);
      continue;
    }
    if (!Boolean.parseBoolean(properties.getProperty(ComplianceConfigurationKeys.SHOULD_DROP_EMPTY_TABLES,
        ComplianceConfigurationKeys.DEFAULT_SHOULD_DROP_EMPTY_TABLES))) {
      continue;
    }
    if (completeTableName.contains(ComplianceConfigurationKeys.TRASH)
        || completeTableName.contains(ComplianceConfigurationKeys.BACKUP)
        || completeTableName.contains(ComplianceConfigurationKeys.STAGING)) {
      this.tablesToDrop.add(hiveDataset);
    }
  }
}
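The loop classifies each table the finder returns into one of three outcomes: tables with partitions are tracked for retention, empty tables are skipped unless dropping is enabled, and even then only trash/backup/staging tables are queued to be dropped. A standalone sketch of that decision follows, with the helper name, enum, and literal substrings invented here for illustration; the real code matches against the ComplianceConfigurationKeys constants.

// Hypothetical helper mirroring the classification in initDatasetFinder; not part of Gobblin.
enum TableAction { TRACK_FOR_RETENTION, SKIP, DROP }

static TableAction classify(boolean hasPartitions, boolean dropEmptyTablesEnabled, String completeTableName) {
  if (hasPartitions) {
    return TableAction.TRACK_FOR_RETENTION;   // added to tableNamesList
  }
  if (!dropEmptyTablesEnabled) {
    return TableAction.SKIP;                  // empty table, but dropping is disabled
  }
  boolean isAuxiliary = completeTableName.contains("trash")
      || completeTableName.contains("backup")
      || completeTableName.contains("staging");
  return isAuxiliary ? TableAction.DROP : TableAction.SKIP;   // only auxiliary empty tables are dropped
}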
Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
The class ComplianceRetentionJob, method run:
public void run() throws IOException {
  // Dropping empty tables
  for (HiveDataset dataset : this.tablesToDrop) {
    log.info("Dropping table: " + dataset.getTable().getCompleteName());
    executeDropTableQuery(dataset, this.properties);
  }
  Preconditions.checkNotNull(this.finder, "Dataset finder class is not set");
  List<Dataset> datasets = this.finder.findDatasets();
  this.finishCleanSignal = Optional.of(new CountDownLatch(datasets.size()));
  for (final Dataset dataset : datasets) {
    ListenableFuture<Void> future = this.service.submit(new Callable<Void>() {

      @Override
      public Void call() throws Exception {
        if (dataset instanceof CleanableDataset) {
          ((CleanableDataset) dataset).clean();
        } else {
          log.warn("Not an instance of " + CleanableDataset.class + " Dataset won't be cleaned " + dataset.datasetURN());
        }
        return null;
      }
    });
    Futures.addCallback(future, new FutureCallback<Void>() {

      @Override
      public void onSuccess(@Nullable Void result) {
        ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
        log.info("Successfully cleaned: " + dataset.datasetURN());
      }

      @Override
      public void onFailure(Throwable t) {
        ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
        log.warn("Exception caught when cleaning " + dataset.datasetURN() + ".", t);
        ComplianceRetentionJob.this.throwables.add(t);
        ComplianceRetentionJob.this.eventSubmitter.submit(ComplianceEvents.Retention.FAILED_EVENT_NAME,
            ImmutableMap.of(ComplianceEvents.FAILURE_CONTEXT_METADATA_KEY, ExceptionUtils.getFullStackTrace(t),
                ComplianceEvents.DATASET_URN_METADATA_KEY, dataset.datasetURN()));
      }
    });
  }
}
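Every callback counts the latch down whether the clean succeeded or failed, so a caller can block until all datasets have been attempted and then inspect the accumulated throwables. A hedged sketch of such a companion wait step follows; the method name and timeout are illustrative, and the actual job has its own finish/close logic.

// Illustrative companion to run(); name and timeout are assumptions, not Gobblin API.
void waitForCleaning() throws IOException {
  try {
    if (this.finishCleanSignal.isPresent() && !this.finishCleanSignal.get().await(1, TimeUnit.HOURS)) {
      throw new IOException("Timed out waiting for dataset cleaning to finish");
    }
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    throw new IOException("Interrupted while waiting for dataset cleaning to finish", e);
  }
  if (!this.throwables.isEmpty()) {
    // Surface individual clean failures as a job failure.
    throw new IOException("Failed to clean " + this.throwables.size() + " dataset(s)");
  }
}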