
Example 6 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

the class HiveQueryExecutionWriter method addPropsForPublisher.

/**
 * Method to add properties needed by publisher to preserve partition params
 */
private void addPropsForPublisher(QueryBasedHiveConversionEntity hiveConversionEntity) {
    if (!hiveConversionEntity.getHivePartition().isPresent()) {
        return;
    }
    ConvertibleHiveDataset convertibleHiveDataset = hiveConversionEntity.getConvertibleHiveDataset();
    for (String format : convertibleHiveDataset.getDestFormats()) {
        Optional<ConvertibleHiveDataset.ConversionConfig> conversionConfigForFormat = convertibleHiveDataset.getConversionConfigForFormat(format);
        if (!conversionConfigForFormat.isPresent()) {
            continue;
        }
        SchemaAwareHivePartition sourcePartition = hiveConversionEntity.getHivePartition().get();
        // Get complete source partition name dbName@tableName@partitionName
        String completeSourcePartitionName = StringUtils.join(
            Arrays.asList(sourcePartition.getTable().getDbName(), sourcePartition.getTable().getTableName(),
                sourcePartition.getName()), AT_CHAR);
        ConvertibleHiveDataset.ConversionConfig config = conversionConfigForFormat.get();
        // Get complete destination partition name dbName@tableName@partitionName
        String completeDestPartitionName = StringUtils.join(
            Arrays.asList(config.getDestinationDbName(), config.getDestinationTableName(),
                sourcePartition.getName()), AT_CHAR);
        workUnit.setProp(HiveConvertPublisher.COMPLETE_SOURCE_PARTITION_NAME, completeSourcePartitionName);
        workUnit.setProp(HiveConvertPublisher.COMPLETE_DEST_PARTITION_NAME, completeDestPartitionName);
    }
}
Also used : ConvertibleHiveDataset(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset) SchemaAwareHivePartition(org.apache.gobblin.data.management.conversion.hive.entities.SchemaAwareHivePartition)
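
For context, the publisher side of this handshake reads the same keys back off the work unit. A minimal sketch of that lookup, assuming a WorkUnitState named workUnitState and the '@' separator written above (the helper name splitPartitionCoordinates is illustrative, not part of the actual HiveConvertPublisher):

// Reads back the property written by addPropsForPublisher() and splits it
// into {dbName, tableName, partitionName} on the '@' separator (AT_CHAR).
private static String[] splitPartitionCoordinates(WorkUnitState workUnitState) {
    String completeName = workUnitState.getProp(HiveConvertPublisher.COMPLETE_SOURCE_PARTITION_NAME);
    return completeName == null ? null : completeName.split("@");
}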

Example 7 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

the class Avro2OrcStaleDatasetCleaner method run.

@Override
public void run() throws Exception {
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
    while (iterator.hasNext()) {
        ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
        try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
            Set<Partition> sourcePartitions = new HashSet<>(
                HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent()));
            // For each partition whose directory name is a unix timestamp, delete any
            // sibling directories (stale copies) that have outlived the grace period.
            sourcePartitions.parallelStream()
                .filter(partition -> isUnixTimeStamp(partition.getDataLocation().getName()))
                .forEach(partition -> Arrays.stream(listFiles(partition.getDataLocation().getParent()))
                    .filter(fileStatus -> !fileStatus.getPath().toString()
                        .equalsIgnoreCase(partition.getDataLocation().toString()))
                    .forEach(fileStatus -> deletePath(fileStatus, this.graceTimeInMillis, true)));
        }
    }
}
Also used : Arrays(java.util.Arrays) HiveUtils(org.apache.gobblin.data.management.copy.hive.HiveUtils) FileSystem(org.apache.hadoop.fs.FileSystem) MetricContext(org.apache.gobblin.metrics.MetricContext) EventConstants(org.apache.gobblin.data.management.conversion.hive.events.EventConstants) ConfigUtils(org.apache.gobblin.util.ConfigUtils) FileStatus(org.apache.hadoop.fs.FileStatus) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Optional(com.google.common.base.Optional) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) ConfigFactory(com.typesafe.config.ConfigFactory) HiveDatasetFinder(org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder) ConvertibleHiveDatasetFinder(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetFinder) Properties(java.util.Properties) Iterator(java.util.Iterator) ValidationJob(org.apache.gobblin.data.management.conversion.hive.validation.ValidationJob) Config(com.typesafe.config.Config) Instrumented(org.apache.gobblin.instrumented.Instrumented) Set(java.util.Set) ConvertibleHiveDataset(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset) IOException(java.io.IOException) TimeUnit(java.util.concurrent.TimeUnit) Partition(org.apache.hadoop.hive.ql.metadata.Partition) EventSubmitter(org.apache.gobblin.metrics.event.EventSubmitter) AbstractJob(azkaban.jobExecutor.AbstractJob) HiveDataset(org.apache.gobblin.data.management.copy.hive.HiveDataset) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) AutoReturnableObject(org.apache.gobblin.util.AutoReturnableObject)
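
The isUnixTimeStamp and deletePath calls above are private helpers of Avro2OrcStaleDatasetCleaner that this page does not reproduce. A plausible sketch of the timestamp check, assuming partition directories are named with epoch-millisecond values (an assumption, not the verbatim Gobblin implementation):

// Assumed implementation: treat a 13-digit directory name (epoch milliseconds,
// e.g. "1467016800000") as a unix-timestamp partition eligible for cleanup.
private static boolean isUnixTimeStamp(String dirName) {
    return dirName.matches("\\d{13}");
}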

Example 8 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

the class ValidationJob method runCountValidation.

private void runCountValidation() throws InterruptedException {
    try {
        // Validation results
        this.successfulConversions = Maps.newConcurrentMap();
        this.failedConversions = Maps.newConcurrentMap();
        this.warnConversions = Maps.newConcurrentMap();
        this.dataValidationFailed = Maps.newConcurrentMap();
        this.dataValidationSuccessful = Maps.newConcurrentMap();
        // Find datasets to validate
        Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
        EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.VALIDATION_FIND_HIVE_TABLES_EVENT);
        while (iterator.hasNext()) {
            ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
            try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
                // Validate dataset
                log.info(String.format("Validating dataset: %s", hiveDataset));
                if (HiveUtils.isPartitioned(hiveDataset.getTable())) {
                    processPartitionedTable(hiveDataset, client);
                } else {
                    processNonPartitionedTable(hiveDataset);
                }
            }
        }
        // Wait for all validation queries to finish
        log.info(String.format("Waiting for %d futures to complete", this.futures.size()));
        this.exec.shutdown();
        this.exec.awaitTermination(4, TimeUnit.HOURS);
        boolean oneFutureFailure = false;
        // Check if there were any exceptions
        for (Future<Void> future : this.futures) {
            try {
                future.get();
            } catch (Throwable t) {
                log.error("getValidationOutputFromHive failed", t);
                oneFutureFailure = true;
            }
        }
        // Emit the collected validation results as log lines in the Azkaban job logs
        for (Map.Entry<String, String> successfulConversion : this.successfulConversions.entrySet()) {
            log.info(String.format("Successful conversion: %s [%s]", successfulConversion.getKey(), successfulConversion.getValue()));
        }
        for (Map.Entry<String, String> warnConversion : this.warnConversions.entrySet()) {
            log.warn(String.format("No conversion found for: %s [%s]", warnConversion.getKey(), warnConversion.getValue()));
        }
        for (Map.Entry<String, String> failedConversion : this.failedConversions.entrySet()) {
            log.error(String.format("Failed conversion: %s [%s]", failedConversion.getKey(), failedConversion.getValue()));
        }
        for (Map.Entry<String, String> success : this.dataValidationSuccessful.entrySet()) {
            log.info(String.format("Data validation successful: %s [%s]", success.getKey(), success.getValue()));
        }
        for (Map.Entry<String, String> failed : this.dataValidationFailed.entrySet()) {
            log.error(String.format("Data validation failed: %s [%s]", failed.getKey(), failed.getValue()));
        }
        if (!this.failedConversions.isEmpty() || !this.dataValidationFailed.isEmpty()) {
            throw new RuntimeException(String.format(
                "Validation failed for %s conversions and %s data validations. See previous logs for exact validation failures",
                this.failedConversions.size(), this.dataValidationFailed.size()));
        }
        if (oneFutureFailure) {
            throw new RuntimeException("At least one hive ddl failed. Check previous logs");
        }
    } catch (IOException e) {
        Throwables.propagate(e);
    }
}
Also used : ConvertibleHiveDataset(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset) IOException(java.io.IOException) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) HiveDataset(org.apache.gobblin.data.management.copy.hive.HiveDataset) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)
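
The futures drained by runCountValidation are produced earlier in the job, where each validation query is handed to this.exec. An illustrative sketch of that producer pattern (pool size and task body are assumptions, not the verbatim ValidationJob code; requires java.util.ArrayList, java.util.List, and java.util.concurrent.*):

// Submit one validation query per dataset and keep its Future so that
// runCountValidation() can await completion and surface any failure.
ExecutorService exec = Executors.newFixedThreadPool(10);
List<Future<Void>> futures = new ArrayList<>();
Callable<Void> validationTask = () -> {
    // run the Hive row-count / validation query for one dataset here
    return null;
};
futures.add(exec.submit(validationTask));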

Aggregations

ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset) 8
Config (com.typesafe.config.Config) 3
UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException) 3
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 3
Optional (com.google.common.base.Optional) 2
UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException) 2
IOException (java.io.IOException) 2
Schema (org.apache.avro.Schema) 2
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState) 2
SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException) 2
ConvertibleHiveDatasetTest (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetTest) 2
QueryBasedHiveConversionEntity (org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHiveConversionEntity) 2
SchemaAwareHiveTable (org.apache.gobblin.data.management.conversion.hive.entities.SchemaAwareHiveTable) 2
HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset) 2
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval) 2
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark) 2
Path (org.apache.hadoop.fs.Path) 2
IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient) 2
AbstractJob (azkaban.jobExecutor.AbstractJob) 1
ImmutableMap (com.google.common.collect.ImmutableMap) 1