Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
In the class RegistrationTimeSkipPredicateTest, the method test:
@Test
public void test() throws Exception {
  Path partition1Path = new Path("/path/to/partition1");
  long modTime = 100000;
  CopyContext copyContext = new CopyContext();
  CopyConfiguration copyConfiguration = Mockito.mock(CopyConfiguration.class);
  Mockito.doReturn(copyContext).when(copyConfiguration).getCopyContext();
  HiveDataset dataset = Mockito.mock(HiveDataset.class);
  FileSystem fs = Mockito.spy(FileSystem.getLocal(new Configuration()));
  FileStatus status = new FileStatus(1, false, 1, 1, modTime, partition1Path);
  Path qualifiedPath = fs.makeQualified(partition1Path);
  Mockito.doReturn(status).when(fs).getFileStatus(qualifiedPath);
  Mockito.doReturn(status).when(fs).getFileStatus(partition1Path);
  Mockito.doReturn(fs).when(dataset).getFs();
  HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
  Mockito.doReturn(copyConfiguration).when(helper).getConfiguration();
  Mockito.doReturn(dataset).when(helper).getDataset();
  RegistrationTimeSkipPredicate predicate = new RegistrationTimeSkipPredicate(helper);
  // Partition exists, but registration time is before the mod time => don't skip
  HivePartitionFileSet pc = createPartitionCopy(partition1Path, modTime - 1, true);
  Assert.assertFalse(predicate.apply(pc));
  // Partition exists, registration time equals the mod time => don't skip
  pc = createPartitionCopy(partition1Path, modTime, true);
  Assert.assertFalse(predicate.apply(pc));
  // Partition exists, registration time is after the mod time => do skip
  pc = createPartitionCopy(partition1Path, modTime + 1, true);
  Assert.assertTrue(predicate.apply(pc));
  // Partition doesn't exist => don't skip
  pc = createPartitionCopy(partition1Path, modTime + 1, false);
  Assert.assertFalse(predicate.apply(pc));
  // Partition exists but is not annotated with a registration time => don't skip
  pc = createPartitionCopy(partition1Path, modTime + 1, true);
  pc.getExistingTargetPartition().get().getParameters().clear();
  Assert.assertFalse(predicate.apply(pc));
}
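The helper createPartitionCopy is not shown in this snippet. A minimal sketch of what it might look like, assuming HivePartitionFileSet exposes its existing target partition via getExistingTargetPartition() and that the registration time is stored in the partition parameters under HiveDataset.REGISTRATION_GENERATION_TIME_MILLIS (the mock wiring below is an assumption; the actual test may build it differently):

// Hypothetical helper: builds a mocked HivePartitionFileSet whose existing target
// partition (if any) carries the given registration generation time.
private HivePartitionFileSet createPartitionCopy(Path location, long registrationGenerationTime,
    boolean targetPartitionExists) {
  HivePartitionFileSet partitionCopy = Mockito.mock(HivePartitionFileSet.class);
  Partition targetPartition = Mockito.mock(Partition.class);
  Mockito.doReturn(location).when(targetPartition).getDataLocation();
  // Mutable map so the test can clear() it to simulate an unannotated partition.
  Map<String, String> parameters = Maps.newHashMap();
  parameters.put(HiveDataset.REGISTRATION_GENERATION_TIME_MILLIS, Long.toString(registrationGenerationTime));
  Mockito.doReturn(parameters).when(targetPartition).getParameters();
  Optional<Partition> existing = targetPartitionExists ? Optional.of(targetPartition) : Optional.<Partition>absent();
  Mockito.doReturn(existing).when(partitionCopy).getExistingTargetPartition();
  return partitionCopy;
}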
Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
In the class Avro2OrcStaleDatasetCleaner, the method run:
@Override
public void run() throws Exception {
  Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
  while (iterator.hasNext()) {
    ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
    try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
      Set<Partition> sourcePartitions =
          new HashSet<>(HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent()));
      sourcePartitions.parallelStream()
          // Only consider partitions whose data location is a timestamped snapshot directory.
          .filter(partition -> isUnixTimeStamp(partition.getDataLocation().getName()))
          .forEach(partition ->
              // Delete sibling snapshot directories other than the current data location,
              // subject to the configured grace period.
              Arrays.stream(listFiles(partition.getDataLocation().getParent()))
                  .filter(fileStatus -> !fileStatus.getPath().toString()
                      .equalsIgnoreCase(partition.getDataLocation().toString()))
                  .forEach(fileStatus -> deletePath(fileStatus, this.graceTimeInMillis, true)));
    }
  }
}
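The filter above relies on an isUnixTimeStamp helper that is not shown. A minimal sketch, assuming the convention that snapshot directories are named with an epoch-milliseconds timestamp (the 13-digit length check is an assumption for illustration):

// Hypothetical sketch: treats a directory name as a snapshot only if it looks like
// an epoch-millis timestamp, i.e. 13 characters that are all digits.
private static boolean isUnixTimeStamp(String dirName) {
  return dirName.length() == 13 && dirName.chars().allMatch(Character::isDigit);
}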
Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
In the class ValidationJob, the method runCountValidation:
private void runCountValidation() throws InterruptedException {
  try {
    // Validation results
    this.successfulConversions = Maps.newConcurrentMap();
    this.failedConversions = Maps.newConcurrentMap();
    this.warnConversions = Maps.newConcurrentMap();
    this.dataValidationFailed = Maps.newConcurrentMap();
    this.dataValidationSuccessful = Maps.newConcurrentMap();
    // Find datasets to validate
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.VALIDATION_FIND_HIVE_TABLES_EVENT);
    while (iterator.hasNext()) {
      ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        // Validate dataset
        log.info(String.format("Validating dataset: %s", hiveDataset));
        if (HiveUtils.isPartitioned(hiveDataset.getTable())) {
          processPartitionedTable(hiveDataset, client);
        } else {
          processNonPartitionedTable(hiveDataset);
        }
      }
    }
    // Wait for all validation queries to finish
    log.info(String.format("Waiting for %d futures to complete", this.futures.size()));
    this.exec.shutdown();
    this.exec.awaitTermination(4, TimeUnit.HOURS);
    boolean oneFutureFailure = false;
    // Check if there were any exceptions
    for (Future<Void> future : this.futures) {
      try {
        future.get();
      } catch (Throwable t) {
        log.error("getValidationOutputFromHive failed", t);
        oneFutureFailure = true;
      }
    }
    // The collected results are then converted into log lines in the Azkaban logs, as below
    for (Map.Entry<String, String> successfulConversion : this.successfulConversions.entrySet()) {
      log.info(String.format("Successful conversion: %s [%s]", successfulConversion.getKey(), successfulConversion.getValue()));
    }
    for (Map.Entry<String, String> warnConversion : this.warnConversions.entrySet()) {
      log.warn(String.format("No conversion found for: %s [%s]", warnConversion.getKey(), warnConversion.getValue()));
    }
    for (Map.Entry<String, String> failedConversion : this.failedConversions.entrySet()) {
      log.error(String.format("Failed conversion: %s [%s]", failedConversion.getKey(), failedConversion.getValue()));
    }
    for (Map.Entry<String, String> success : this.dataValidationSuccessful.entrySet()) {
      log.info(String.format("Data validation successful: %s [%s]", success.getKey(), success.getValue()));
    }
    for (Map.Entry<String, String> failed : this.dataValidationFailed.entrySet()) {
      log.error(String.format("Data validation failed: %s [%s]", failed.getKey(), failed.getValue()));
    }
    if (!this.failedConversions.isEmpty() || !this.dataValidationFailed.isEmpty()) {
      throw new RuntimeException(String.format("Validation failed for %s conversions. See previous logs for exact validation failures", failedConversions.size()));
    }
    if (oneFutureFailure) {
      throw new RuntimeException("At least one hive ddl failed. Check previous logs");
    }
  } catch (IOException e) {
    Throwables.propagate(e);
  }
}
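The count queries themselves are submitted inside processPartitionedTable and processNonPartitionedTable, which are not shown. A rough sketch of how one such future might be enqueued, where the getValidationOutputFromHive signature, datasetName, and validationQueries are assumptions for illustration (the real job builds these internally):

// Hypothetical sketch of enqueueing one count-validation task; rowCounts holds the
// row count returned by each query, and all counts must agree for validation to pass.
this.futures.add(this.exec.submit(() -> {
  List<Long> rowCounts = getValidationOutputFromHive(validationQueries);
  if (rowCounts.stream().distinct().count() == 1) {
    this.dataValidationSuccessful.put(datasetName, rowCounts.toString());
  } else {
    this.dataValidationFailed.put(datasetName, rowCounts.toString());
  }
  return null;
}));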
Use of org.apache.gobblin.data.management.copy.hive.HiveDataset in project incubator-gobblin by apache.
In the class HiveMaterializerSource, the method getHiveDataset:
private HiveDataset getHiveDataset(String tableString, FileSystem fs, State state) throws IOException {
  try {
    HiveMetastoreClientPool pool =
        HiveMetastoreClientPool.get(state.getProperties(), Optional.fromNullable(state.getProp(HIVE_METASTORE_URI_KEY)));
    // tableString is expected in "db.table" form.
    List<String> tokens = Splitter.on(".").splitToList(tableString);
    DbAndTable sourceDbAndTable = new DbAndTable(tokens.get(0), tokens.get(1));
    try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
      Table sourceTable = new Table(client.get().getTable(sourceDbAndTable.getDb(), sourceDbAndTable.getTable()));
      return new HiveDataset(fs, pool, sourceTable, ConfigUtils.propertiesToConfig(state.getProperties()));
    }
  } catch (TException exc) {
    throw new RuntimeException(exc);
  }
}
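A short usage sketch: resolving a "db.table" string into a HiveDataset backed by the given FileSystem. HIVE_METASTORE_URI_KEY is the same constant read by the method above; the metastore URI and table name are placeholders.

// Usage sketch with a placeholder metastore URI and table name.
State state = new State();
state.setProp(HIVE_METASTORE_URI_KEY, "thrift://metastore.example.com:9083");
FileSystem fs = FileSystem.get(new Configuration());
HiveDataset dataset = getHiveDataset("mydb.events", fs, state);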