Example 11 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class RegistrationTimeSkipPredicateTest, method test:

@Test
public void test() throws Exception {
    Path partition1Path = new Path("/path/to/partition1");
    long modTime = 100000;
    CopyContext copyContext = new CopyContext();
    CopyConfiguration copyConfiguration = Mockito.mock(CopyConfiguration.class);
    Mockito.doReturn(copyContext).when(copyConfiguration).getCopyContext();
    HiveDataset dataset = Mockito.mock(HiveDataset.class);
    FileSystem fs = Mockito.spy(FileSystem.getLocal(new Configuration()));
    FileStatus status = new FileStatus(1, false, 1, 1, modTime, partition1Path);
    Path qualifiedPath = fs.makeQualified(partition1Path);
    Mockito.doReturn(status).when(fs).getFileStatus(qualifiedPath);
    Mockito.doReturn(status).when(fs).getFileStatus(partition1Path);
    Mockito.doReturn(fs).when(dataset).getFs();
    HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
    Mockito.doReturn(copyConfiguration).when(helper).getConfiguration();
    Mockito.doReturn(dataset).when(helper).getDataset();
    RegistrationTimeSkipPredicate predicate = new RegistrationTimeSkipPredicate(helper);
    // partition exists, but registration time before modtime => don't skip
    HivePartitionFileSet pc = createPartitionCopy(partition1Path, modTime - 1, true);
    Assert.assertFalse(predicate.apply(pc));
    // partition exists, registration time equal to modtime => don't skip
    pc = createPartitionCopy(partition1Path, modTime, true);
    Assert.assertFalse(predicate.apply(pc));
    // partition exists, registration time later than modtime => do skip
    pc = createPartitionCopy(partition1Path, modTime + 1, true);
    Assert.assertTrue(predicate.apply(pc));
    // partition doesn't exist => don't skip
    pc = createPartitionCopy(partition1Path, modTime + 1, false);
    Assert.assertFalse(predicate.apply(pc));
    // partition exists but is not annotated => don't skip
    pc = createPartitionCopy(partition1Path, modTime + 1, true);
    pc.getExistingTargetPartition().get().getParameters().clear();
    Assert.assertFalse(predicate.apply(pc));
}
Also used:
Path (org.apache.hadoop.fs.Path)
FileStatus (org.apache.hadoop.fs.FileStatus)
CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration)
Configuration (org.apache.hadoop.conf.Configuration)
FileSystem (org.apache.hadoop.fs.FileSystem)
HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset)
CopyContext (org.apache.gobblin.data.management.copy.CopyContext)
HivePartitionFileSet (org.apache.gobblin.data.management.copy.hive.HivePartitionFileSet)
HiveCopyEntityHelper (org.apache.gobblin.data.management.copy.hive.HiveCopyEntityHelper)
Test (org.testng.annotations.Test)
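
The helper createPartitionCopy is not shown in this snippet. Here is a minimal sketch of what it might look like, assuming the predicate reads the registration time from a parameter on the existing target partition; the key constant and the mock wiring are assumptions, not the actual test code:

private HivePartitionFileSet createPartitionCopy(Path location, long registrationTime, boolean targetPartitionExists) {
    HivePartitionFileSet partitionCopy = Mockito.mock(HivePartitionFileSet.class);
    if (targetPartitionExists) {
        // Assumed annotation key; the predicate compares this value against the file's mod time.
        Map<String, String> parameters = new HashMap<>();
        parameters.put(HiveDataset.REGISTRATION_GENERATION_TIME_MILLIS, Long.toString(registrationTime));
        Partition targetPartition = Mockito.mock(Partition.class);
        Mockito.doReturn(location).when(targetPartition).getDataLocation();
        Mockito.doReturn(parameters).when(targetPartition).getParameters();
        Mockito.doReturn(Optional.of(targetPartition)).when(partitionCopy).getExistingTargetPartition();
    } else {
        // No target partition registered yet => the predicate must not skip.
        Mockito.doReturn(Optional.absent()).when(partitionCopy).getExistingTargetPartition();
    }
    return partitionCopy;
}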

Example 12 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class Avro2OrcStaleDatasetCleaner, method run:

@Override
public void run() throws Exception {
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
    while (iterator.hasNext()) {
        ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
        try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
            Set<Partition> sourcePartitions = new HashSet<>(HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent()));
            sourcePartitions.parallelStream()
                    .filter(partition -> isUnixTimeStamp(partition.getDataLocation().getName()))
                    .forEach(partition -> Arrays.stream(listFiles(partition.getDataLocation().getParent()))
                            .filter(fileStatus -> !fileStatus.getPath().toString()
                                    .equalsIgnoreCase(partition.getDataLocation().toString()))
                            .forEach(fileStatus -> deletePath(fileStatus, this.graceTimeInMillis, true)));
        }
    }
}
Also used:
Arrays (java.util.Arrays)
HiveUtils (org.apache.gobblin.data.management.copy.hive.HiveUtils)
FileSystem (org.apache.hadoop.fs.FileSystem)
MetricContext (org.apache.gobblin.metrics.MetricContext)
EventConstants (org.apache.gobblin.data.management.conversion.hive.events.EventConstants)
ConfigUtils (org.apache.gobblin.util.ConfigUtils)
FileStatus (org.apache.hadoop.fs.FileStatus)
HashSet (java.util.HashSet)
Logger (org.apache.log4j.Logger)
Optional (com.google.common.base.Optional)
Configuration (org.apache.hadoop.conf.Configuration)
Path (org.apache.hadoop.fs.Path)
ConfigFactory (com.typesafe.config.ConfigFactory)
HiveDatasetFinder (org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder)
ConvertibleHiveDatasetFinder (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetFinder)
Properties (java.util.Properties)
Iterator (java.util.Iterator)
ValidationJob (org.apache.gobblin.data.management.conversion.hive.validation.ValidationJob)
Config (com.typesafe.config.Config)
Instrumented (org.apache.gobblin.instrumented.Instrumented)
Set (java.util.Set)
ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset)
IOException (java.io.IOException)
TimeUnit (java.util.concurrent.TimeUnit)
Partition (org.apache.hadoop.hive.ql.metadata.Partition)
EventSubmitter (org.apache.gobblin.metrics.event.EventSubmitter)
AbstractJob (azkaban.jobExecutor.AbstractJob)
HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset)
IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient)
AutoReturnableObject (org.apache.gobblin.util.AutoReturnableObject)
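
The helpers isUnixTimeStamp, listFiles, and deletePath are defined elsewhere in the class. As an illustration, a plausible isUnixTimeStamp check could simply test whether the directory name parses as an epoch-millis timestamp; the 13-digit heuristic below is an assumption, not the real implementation:

private static boolean isUnixTimeStamp(String dirName) {
    // Assumed heuristic: a purely numeric, 13-character name is treated as epoch millis.
    return dirName.length() == 13 && dirName.chars().allMatch(Character::isDigit);
}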

Example 13 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class ValidationJob, method runCountValidation:

private void runCountValidation() throws InterruptedException {
    try {
        // Validation results
        this.successfulConversions = Maps.newConcurrentMap();
        this.failedConversions = Maps.newConcurrentMap();
        this.warnConversions = Maps.newConcurrentMap();
        this.dataValidationFailed = Maps.newConcurrentMap();
        this.dataValidationSuccessful = Maps.newConcurrentMap();
        // Find datasets to validate
        Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
        EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.VALIDATION_FIND_HIVE_TABLES_EVENT);
        while (iterator.hasNext()) {
            ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
            try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
                // Validate dataset
                log.info(String.format("Validating dataset: %s", hiveDataset));
                if (HiveUtils.isPartitioned(hiveDataset.getTable())) {
                    processPartitionedTable(hiveDataset, client);
                } else {
                    processNonPartitionedTable(hiveDataset);
                }
            }
        }
        // Wait for all validation queries to finish
        log.info(String.format("Waiting for %d futures to complete", this.futures.size()));
        this.exec.shutdown();
        this.exec.awaitTermination(4, TimeUnit.HOURS);
        boolean oneFutureFailure = false;
        // Check if there were any exceptions
        for (Future<Void> future : this.futures) {
            try {
                future.get();
            } catch (Throwable t) {
                log.error("getValidationOutputFromHive failed", t);
                oneFutureFailure = true;
            }
        }
        // Emit per-dataset results as log lines (these surface in the Azkaban logs)
        for (Map.Entry<String, String> successfulConversion : this.successfulConversions.entrySet()) {
            log.info(String.format("Successful conversion: %s [%s]", successfulConversion.getKey(), successfulConversion.getValue()));
        }
        for (Map.Entry<String, String> warnConversion : this.warnConversions.entrySet()) {
            log.warn(String.format("No conversion found for: %s [%s]", warnConversion.getKey(), warnConversion.getValue()));
        }
        for (Map.Entry<String, String> failedConversion : this.failedConversions.entrySet()) {
            log.error(String.format("Failed conversion: %s [%s]", failedConversion.getKey(), failedConversion.getValue()));
        }
        for (Map.Entry<String, String> success : this.dataValidationSuccessful.entrySet()) {
            log.info(String.format("Data validation successful: %s [%s]", success.getKey(), success.getValue()));
        }
        for (Map.Entry<String, String> failed : this.dataValidationFailed.entrySet()) {
            log.error(String.format("Data validation failed: %s [%s]", failed.getKey(), failed.getValue()));
        }
        if (!this.failedConversions.isEmpty() || !this.dataValidationFailed.isEmpty()) {
            throw new RuntimeException(String.format("Validation failed for %d conversions and %d data validations. See previous logs for the exact failures", this.failedConversions.size(), this.dataValidationFailed.size()));
        }
        if (oneFutureFailure) {
            throw new RuntimeException("At least one hive ddl failed. Check previous logs");
        }
    } catch (IOException e) {
        Throwables.propagate(e);
    }
}
Also used:
ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset)
IOException (java.io.IOException)
IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient)
HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset)
Map (java.util.Map)
ImmutableMap (com.google.common.collect.ImmutableMap)
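
The method drains this.futures, but the snippet does not show how they are created (that happens in processPartitionedTable and processNonPartitionedTable). A hedged sketch of how a count-validation task could be queued on the executor; submitCountValidation and runCountQuery are illustrative names, not the real ValidationJob API:

private void submitCountValidation(final ConvertibleHiveDataset dataset) {
    this.futures.add(this.exec.submit(new Callable<Void>() {
        @Override
        public Void call() throws Exception {
            // runCountQuery is an assumed helper that runs a COUNT(*) against the
            // source table and the converted target table.
            long sourceCount = runCountQuery(dataset, true);
            long targetCount = runCountQuery(dataset, false);
            String key = dataset.getTable().getDbName() + "." + dataset.getTable().getTableName();
            if (sourceCount == targetCount) {
                dataValidationSuccessful.put(key, "rows=" + sourceCount);
            } else {
                dataValidationFailed.put(key, String.format("source=%d, target=%d", sourceCount, targetCount));
            }
            return null;
        }
    }));
}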

Example 14 with HiveDataset

Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.

From the class HiveMaterializerSource, method getHiveDataset:

private HiveDataset getHiveDataset(String tableString, FileSystem fs, State state) throws IOException {
    try {
        HiveMetastoreClientPool pool = HiveMetastoreClientPool.get(state.getProperties(), Optional.fromNullable(state.getProp(HIVE_METASTORE_URI_KEY)));
        // tableString is expected in "<db>.<table>" form
        List<String> tokens = Splitter.on(".").splitToList(tableString);
        DbAndTable sourceDbAndTable = new DbAndTable(tokens.get(0), tokens.get(1));
        try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
            Table sourceTable = new Table(client.get().getTable(sourceDbAndTable.getDb(), sourceDbAndTable.getTable()));
            return new HiveDataset(fs, pool, sourceTable, ConfigUtils.propertiesToConfig(state.getProperties()));
        }
    } catch (TException exc) {
        throw new RuntimeException(exc);
    }
}
Also used:
TException (org.apache.thrift.TException)
Table (org.apache.hadoop.hive.ql.metadata.Table)
HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset)
IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient)
HiveMetastoreClientPool (org.apache.gobblin.hive.HiveMetastoreClientPool)
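
A hedged usage sketch of this method; the table name, metastore URI, and State contents are placeholders for illustration:

// Hypothetical call site; "mydb.events" and the thrift URI are placeholders.
State state = new State();
state.setProp(HIVE_METASTORE_URI_KEY, "thrift://metastore-host:9083");
FileSystem fs = FileSystem.get(new Configuration());
HiveDataset dataset = getHiveDataset("mydb.events", fs, state);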

Aggregations

HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset)14 IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient)8 IOException (java.io.IOException)6 Configuration (org.apache.hadoop.conf.Configuration)6 Table (org.apache.hadoop.hive.ql.metadata.Table)5 Properties (java.util.Properties)4 Partition (org.apache.hadoop.hive.ql.metadata.Partition)4 ArrayList (java.util.ArrayList)3 ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset)3 HiveDatasetFinder (org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder)3 FileSystem (org.apache.hadoop.fs.FileSystem)3 Config (com.typesafe.config.Config)2 File (java.io.File)2 List (java.util.List)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 Path (org.apache.hadoop.fs.Path)2 TException (org.apache.thrift.TException)2 Test (org.testng.annotations.Test)2 AbstractJob (azkaban.jobExecutor.AbstractJob)1 Optional (com.google.common.base.Optional)1