Example 1 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class HiveSource, the method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        this.beginGetWorkunitsTime = System.currentTimeMillis();
        initialize(state);
        EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);
        Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
        while (iterator.hasNext()) {
            HiveDataset hiveDataset = iterator.next();
            try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
                // Borrow a metastore client from the shared pool; try-with-resources returns it automatically
                log.debug(String.format("Processing dataset: %s", hiveDataset));
                // Create workunits for partitions
                if (HiveUtils.isPartitioned(hiveDataset.getTable()) && state.getPropAsBoolean(HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS, DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
                    createWorkunitsForPartitionedTable(hiveDataset, client);
                } else {
                    createWorkunitForNonPartitionedTable(hiveDataset);
                }
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    int realWorkunits = this.workunits.size();
    // The watermarker may append watermark-only workunits beyond the count captured above
    this.watermarker.onGetWorkunitsEnd(this.workunits);
    log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits, (this.workunits.size() - realWorkunits)));
    return this.workunits;
}
Also used: ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset), IOException (java.io.IOException), IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient)
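
The method iterates datasets lazily and borrows one metastore client per dataset, so the client goes back to the pool even when work-unit creation throws, and it only expands partitions when the table is partitioned and the job opts in. A minimal sketch of driving this Source, assuming a hypothetical whitelist key (real deployments configure the HiveDatasetFinder through Gobblin job properties):

SourceState state = new SourceState();
// Illustrative key and value only; the actual key names are defined by HiveDatasetFinder.
state.setProp("hive.dataset.whitelist", "mydb.my_table");

List<WorkUnit> workUnits = new HiveSource().getWorkunits(state);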

Example 2 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class SimpleHiveDatasetTieringPrioritizerTest, the method getRequestor:

private CopyableDatasetRequestor getRequestor(String dbName, String tableName) {
    CopyableDatasetRequestor requestor = Mockito.mock(CopyableDatasetRequestor.class);
    HiveDataset dataset = Mockito.mock(HiveDataset.class);
    // Wrap a raw Thrift-level table in the ql.metadata Table facade that HiveDataset exposes
    Table table = new Table(new org.apache.hadoop.hive.metastore.api.Table());
    table.setDbName(dbName);
    table.setTableName(tableName);
    Mockito.when(dataset.getTable()).thenReturn(table);
    Mockito.when(requestor.getDataset()).thenReturn(dataset);
    return requestor;
}
Also used: Table (org.apache.hadoop.hive.ql.metadata.Table), HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset), CopyableDatasetRequestor (org.apache.gobblin.data.management.partition.CopyableDatasetRequestor)
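
Because only getTable() and getDataset() are stubbed, the helper gives prioritizer tests just enough surface to sort requestors by database and table name without touching a real metastore. A hedged usage sketch using TestNG's Assert and only the stubbed accessors:

// Only the stubbed accessors are safe to call on these mocks.
CopyableDatasetRequestor first = getRequestor("db1", "tableA");
CopyableDatasetRequestor second = getRequestor("db2", "tableB");
Assert.assertEquals(first.getDataset().getTable().getDbName(), "db1");
Assert.assertEquals(second.getDataset().getTable().getTableName(), "tableB");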

Example 3 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class HiveMaterializerSource, the method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        FileSystem fs = HadoopUtils.getSourceFileSystem(state);
        Config config = ConfigUtils.propertiesToConfig(state.getProperties());
        if (state.contains(COPY_TABLE_KEY)) {
            HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
            WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset, new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
            HiveTask.disableHiveWatermarker(workUnit);
            return Lists.newArrayList(workUnit);
        } else if (state.contains(MATERIALIZE_VIEW)) {
            HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
            WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state), new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
            HiveTask.disableHiveWatermarker(workUnit);
            return Lists.newArrayList(workUnit);
        } else if (state.contains(MATERIALIZE_QUERY)) {
            String query = state.getProp(MATERIALIZE_QUERY);
            WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state), new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
            HiveTask.disableHiveWatermarker(workUnit);
            return Lists.newArrayList(workUnit);
        }
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
    throw new RuntimeException(String.format("Must specify either %s, %s, or %s.", COPY_TABLE_KEY, MATERIALIZE_QUERY, MATERIALIZE_VIEW));
}
Also used: Config (com.typesafe.config.Config), FileSystem (org.apache.hadoop.fs.FileSystem), HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), IOException (java.io.IOException), StageableTableMetadata (org.apache.gobblin.data.management.conversion.hive.entities.StageableTableMetadata)
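
Exactly one of COPY_TABLE_KEY, MATERIALIZE_VIEW, or MATERIALIZE_QUERY must be present; each branch emits a single work unit with the Hive watermarker disabled, and configuration under HIVE_MATERIALIZER_SOURCE_PREFIX feeds the StageableTableMetadata. A minimal sketch of the copy-table path, assuming the key constant is publicly accessible, with a placeholder table name and omitting the staging-table settings a real job would also supply under HIVE_MATERIALIZER_SOURCE_PREFIX:

SourceState state = new SourceState();
// Placeholder value; the key constant itself comes from HiveMaterializerSource.
state.setProp(HiveMaterializerSource.COPY_TABLE_KEY, "mydb.source_table");

// A single materialization request yields exactly one work unit.
List<WorkUnit> workUnits = new HiveMaterializerSource().getWorkunits(state);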

Example 4 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class ComplianceRetentionJob, the method initDatasetFinder:

public void initDatasetFinder(Properties properties) throws IOException {
    Preconditions.checkArgument(properties.containsKey(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS), "Missing required property " + GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
    String finderClass = properties.getProperty(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
    this.finder = GobblinConstructorUtils.invokeConstructor(DatasetsFinder.class, finderClass, new State(properties));
    Iterator<HiveDataset> datasetsIterator = new HiveDatasetFinder(FileSystem.newInstance(new Configuration()), properties).getDatasetsIterator();
    while (datasetsIterator.hasNext()) {
        // Keep tables that still have partitions; if configured, queue empty trash/backup/staging tables for dropping
        HiveDataset hiveDataset = datasetsIterator.next();
        List<Partition> partitionsFromDataset = hiveDataset.getPartitionsFromDataset();
        String completeTableName = hiveDataset.getTable().getCompleteName();
        if (!partitionsFromDataset.isEmpty()) {
            this.tableNamesList.add(completeTableName);
            continue;
        }
        if (!Boolean.parseBoolean(properties.getProperty(ComplianceConfigurationKeys.SHOULD_DROP_EMPTY_TABLES, ComplianceConfigurationKeys.DEFAULT_SHOULD_DROP_EMPTY_TABLES))) {
            continue;
        }
        if (completeTableName.contains(ComplianceConfigurationKeys.TRASH) || completeTableName.contains(ComplianceConfigurationKeys.BACKUP) || completeTableName.contains(ComplianceConfigurationKeys.STAGING)) {
            this.tablesToDrop.add(hiveDataset);
        }
    }
}
Also used: Partition (org.apache.hadoop.hive.ql.metadata.Partition), Configuration (org.apache.hadoop.conf.Configuration), State (org.apache.gobblin.configuration.State), DatasetsFinder (org.apache.gobblin.dataset.DatasetsFinder), HiveDatasetFinder (org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder), HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset)
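
The loop keeps any table that still has partitions, and only when SHOULD_DROP_EMPTY_TABLES is enabled does it queue empty trash, backup, or staging tables for dropping. The same decision reads more clearly as a predicate; a sketch for illustration (the helper name is hypothetical, the logic is lifted directly from the loop above):

// Hypothetical helper capturing the drop decision from initDatasetFinder.
private boolean shouldDrop(HiveDataset dataset, Properties properties) {
    if (!dataset.getPartitionsFromDataset().isEmpty()) {
        return false; // non-empty tables are retained and tracked by name
    }
    boolean dropEmpty = Boolean.parseBoolean(properties.getProperty(
        ComplianceConfigurationKeys.SHOULD_DROP_EMPTY_TABLES,
        ComplianceConfigurationKeys.DEFAULT_SHOULD_DROP_EMPTY_TABLES));
    if (!dropEmpty) {
        return false;
    }
    String name = dataset.getTable().getCompleteName();
    return name.contains(ComplianceConfigurationKeys.TRASH)
        || name.contains(ComplianceConfigurationKeys.BACKUP)
        || name.contains(ComplianceConfigurationKeys.STAGING);
}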

Example 5 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class ComplianceRetentionJob, the method run:

public void run() throws IOException {
    // Dropping empty tables
    for (HiveDataset dataset : this.tablesToDrop) {
        log.info("Dropping table: " + dataset.getTable().getCompleteName());
        executeDropTableQuery(dataset, this.properties);
    }
    Preconditions.checkNotNull(this.finder, "Dataset finder class is not set");
    List<Dataset> datasets = this.finder.findDatasets();
    this.finishCleanSignal = Optional.of(new CountDownLatch(datasets.size()));
    for (final Dataset dataset : datasets) {
        ListenableFuture<Void> future = this.service.submit(new Callable<Void>() {

            @Override
            public Void call() throws Exception {
                if (dataset instanceof CleanableDataset) {
                    ((CleanableDataset) dataset).clean();
                } else {
                    log.warn("Not an instance of " + CleanableDataset.class + " Dataset won't be cleaned " + dataset.datasetURN());
                }
                return null;
            }
        });
        Futures.addCallback(future, new FutureCallback<Void>() {

            @Override
            public void onSuccess(@Nullable Void result) {
                ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
                log.info("Successfully cleaned: " + dataset.datasetURN());
            }

            @Override
            public void onFailure(Throwable t) {
                ComplianceRetentionJob.this.finishCleanSignal.get().countDown();
                log.warn("Exception caught when cleaning " + dataset.datasetURN() + ".", t);
                ComplianceRetentionJob.this.throwables.add(t);
                ComplianceRetentionJob.this.eventSubmitter.submit(ComplianceEvents.Retention.FAILED_EVENT_NAME, ImmutableMap.of(ComplianceEvents.FAILURE_CONTEXT_METADATA_KEY, ExceptionUtils.getFullStackTrace(t), ComplianceEvents.DATASET_URN_METADATA_KEY, dataset.datasetURN()));
            }
        });
    }
}
Also used: CleanableDataset (org.apache.gobblin.data.management.retention.dataset.CleanableDataset), Dataset (org.apache.gobblin.dataset.Dataset), HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset), CountDownLatch (java.util.concurrent.CountDownLatch), SQLException (java.sql.SQLException), TException (org.apache.thrift.TException), IOException (java.io.IOException)
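
The cleaning work runs asynchronously on this.service, and both callbacks count the latch down, so the latch reaches zero once every dataset has either been cleaned or failed. A caller is expected to await it before inspecting this.throwables; a minimal sketch of that closeout (the method name is hypothetical):

// Hypothetical closeout: block until every dataset's callback has fired, then surface failures.
private void waitForCleanToFinish() throws InterruptedException {
    if (this.finishCleanSignal.isPresent()) {
        this.finishCleanSignal.get().await(); // one countDown per dataset, success or failure
    }
    if (!this.throwables.isEmpty()) {
        throw new RuntimeException("Retention cleaning failed for " + this.throwables.size() + " dataset(s)");
    }
}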

Aggregations

HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset): 14
IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient): 8
IOException (java.io.IOException): 6
Configuration (org.apache.hadoop.conf.Configuration): 6
Table (org.apache.hadoop.hive.ql.metadata.Table): 5
Properties (java.util.Properties): 4
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 4
ArrayList (java.util.ArrayList): 3
ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset): 3
HiveDatasetFinder (org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder): 3
FileSystem (org.apache.hadoop.fs.FileSystem): 3
Config (com.typesafe.config.Config): 2
File (java.io.File): 2
List (java.util.List): 2
FileStatus (org.apache.hadoop.fs.FileStatus): 2
Path (org.apache.hadoop.fs.Path): 2
TException (org.apache.thrift.TException): 2
Test (org.testng.annotations.Test): 2
AbstractJob (azkaban.jobExecutor.AbstractJob): 1
Optional (com.google.common.base.Optional): 1