Use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.
The class HiveSource, method createWorkunitForNonPartitionedTable.
protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
  // Create workunits for tables
  try {
    long tableProcessTime = new DateTime().getMillis();
    long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
    if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
      log.info(String.format("Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s", hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(), this.ignoreDataPathIdentifierList));
      return;
    }
    if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
      log.info(String.format("Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
      HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
      LongWatermark expectedDatasetHighWatermark = this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
      hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
      EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
      if (hiveDataset instanceof ConvertibleHiveDataset) {
        setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
        log.info("Added lineage event for dataset " + hiveDataset.getUrn());
      }
      this.workunits.add(hiveWorkUnit);
      log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
    } else {
      log.info(String.format("Not creating workunit for table %s as updateTime %s and createTime %s is not greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
    }
  } catch (UpdateNotFoundException e) {
    log.error(String.format("Not Creating workunit for %s as update time was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
  } catch (SchemaNotFoundException e) {
    log.error(String.format("Not Creating workunit for %s as schema was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
  }
}
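The branch above delegates to shouldCreateWorkunit(table, lowWatermark), which, judging from the log messages, creates a work unit when either the table's update time or its create time is newer than the previously committed low watermark. Below is a minimal sketch of that comparison, assuming epoch-millisecond watermarks and the seconds-based create time exposed by the Thrift table; it is an illustration, not the Gobblin implementation.

import java.util.concurrent.TimeUnit;

final class WatermarkCheckSketch {
  // Illustration only: the decision implied by the log messages in createWorkunitForNonPartitionedTable.
  // The Hive Thrift table's createTime is in seconds, so it is scaled to millis before comparing.
  static boolean isNewerThanWatermark(long updateTimeMillis, long createTimeSeconds, long lowWatermarkMillis) {
    long createTimeMillis = TimeUnit.SECONDS.toMillis(createTimeSeconds);
    return Math.max(updateTimeMillis, createTimeMillis) > lowWatermarkMillis;
  }
}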
Use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.
The class ValidationJob, method processPartitionedTable.
/**
 * Validate all {@link Partition}s for a {@link Table} if it was updated recently, by checking whether its update time
 * lies within the maxLookBackTime and skipRecentThanTime window.
 * @param hiveDataset {@link HiveDataset} containing {@link Table} and {@link Partition} info.
 * @param client {@link IMetaStoreClient} to query Hive.
 * @throws IOException Issue in validating {@link HiveDataset}
 */
private void processPartitionedTable(ConvertibleHiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
  // Get partitions for the table
  List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent());
  for (final String format : hiveDataset.getDestFormats()) {
    Optional<ConvertibleHiveDataset.ConversionConfig> conversionConfigOptional = hiveDataset.getConversionConfigForFormat(format);
    if (conversionConfigOptional.isPresent()) {
      // Get conversion config
      ConvertibleHiveDataset.ConversionConfig conversionConfig = conversionConfigOptional.get();
      String orcTableName = conversionConfig.getDestinationTableName();
      String orcTableDatabase = conversionConfig.getDestinationDbName();
      Pair<Optional<org.apache.hadoop.hive.metastore.api.Table>, Optional<List<Partition>>> destinationMeta = HiveConverterUtils.getDestinationTableMeta(orcTableDatabase, orcTableName, this.props);
      // Validate each partition
      for (final Partition sourcePartition : sourcePartitions) {
        try {
          final long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
          if (shouldValidate(sourcePartition)) {
            log.info(String.format("Validating partition: %s", sourcePartition.getCompleteName()));
            // Generate validation queries
            final List<String> countValidationQueries = HiveValidationQueryGenerator.generateCountValidationQueries(hiveDataset, Optional.of(sourcePartition), conversionConfig);
            final List<String> dataValidationQueries = Lists.newArrayList(HiveValidationQueryGenerator.generateDataValidationQuery(hiveDataset.getTable().getTableName(), hiveDataset.getTable().getDbName(), destinationMeta.getKey().get(), Optional.of(sourcePartition), this.isNestedORC));
            this.futures.add(this.exec.submit(new Callable<Void>() {
              @Override
              public Void call() throws Exception {
                // Execute validation queries
                log.debug(String.format("Going to execute count validation queries queries: %s for format: %s " + "and partition %s", countValidationQueries, format, sourcePartition.getCompleteName()));
                List<Long> rowCounts = ValidationJob.this.getValidationOutputFromHive(countValidationQueries);
                log.debug(String.format("Going to execute data validation queries: %s for format: %s and partition %s", dataValidationQueries, format, sourcePartition.getCompleteName()));
                List<Long> rowDataValidatedCount = ValidationJob.this.getValidationOutputFromHive(dataValidationQueries);
                // Validate and populate report
                validateAndPopulateReport(sourcePartition.getCompleteName(), updateTime, rowCounts, rowDataValidatedCount);
                return null;
              }
            }));
          } else {
            log.debug(String.format("Not validating partition: %s as updateTime: %s is not in range of max look back: %s " + "and skip recent than: %s", sourcePartition.getCompleteName(), updateTime, this.maxLookBackTime, this.skipRecentThanTime));
          }
        } catch (UncheckedExecutionException e) {
          log.warn(String.format("Not validating partition: %s %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (UpdateNotFoundException e) {
          log.warn(String.format("Not validating partition: %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        }
      }
    } else {
      log.info(String.format("No conversion config found for format %s. Ignoring data validation", format));
    }
  }
}
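The shouldValidate call above implements the window described in the Javadoc: a partition is validated only when its update time falls between maxLookBackTime and skipRecentThanTime. Here is a minimal sketch of such a window check, assuming Joda-Time boundaries like the ones the class logs; it is an illustration, not the ValidationJob implementation.

import org.joda.time.DateTime;

final class ValidationWindowSketch {
  // Illustration only: true when the partition changed after maxLookBackTime but before
  // skipRecentThanTime, i.e. recent enough to care about and old enough to have settled.
  static boolean inValidationWindow(long updateTimeMillis, DateTime maxLookBackTime, DateTime skipRecentThanTime) {
    return updateTimeMillis > maxLookBackTime.getMillis() && updateTimeMillis < skipRecentThanTime.getMillis();
  }
}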
Use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.
The class HiveAvroToOrcConverterTest, method testNestedSchemaDDLandDML.
/**
 * Test nested DDL and DML generation
 * @throws Exception
 */
@Test
public void testNestedSchemaDDLandDML() throws Exception {
  String dbName = "testdb";
  String tableName = "testtable";
  String tableSdLoc = "/tmp/testtable";
  this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);
  Table table = this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName, tableSdLoc, Optional.<String>absent());
  Schema schema = ConversionHiveTestUtils.readSchemaFromJsonFile(resourceDir, "recordWithinRecordWithinRecord_nested.json");
  WorkUnitState wus = ConversionHiveTestUtils.createWus(dbName, tableName, 0);
  wus.getJobState().setProp("orc.table.flatten.schema", "false");
  try (HiveAvroToNestedOrcConverter converter = new HiveAvroToNestedOrcConverter()) {
    Config config = ConfigFactory.parseMap(ImmutableMap.<String, String>builder()
        .put("destinationFormats", "nestedOrc")
        .put("nestedOrc.destination.tableName", "testtable_orc_nested")
        .put("nestedOrc.destination.dbName", dbName)
        .put("nestedOrc.destination.dataPath", "file:/tmp/testtable_orc_nested")
        .build());
    ConvertibleHiveDataset cd = ConvertibleHiveDatasetTest.createTestConvertibleDataset(config);
    List<QueryBasedHiveConversionEntity> conversionEntities = Lists.newArrayList(converter.convertRecord(converter.convertSchema(schema, wus), new QueryBasedHiveConversionEntity(cd, new SchemaAwareHiveTable(table, schema)), wus));
    Assert.assertEquals(conversionEntities.size(), 1, "Only one query entity should be returned");
    QueryBasedHiveConversionEntity queryBasedHiveConversionEntity = conversionEntities.get(0);
    List<String> queries = queryBasedHiveConversionEntity.getQueries();
    Assert.assertEquals(queries.size(), 4, "4 DDL and one DML query should be returned");
    // Ignoring part before first bracket in DDL and 'select' clause in DML because staging table has
    // .. a random name component
    String actualDDLQuery = StringUtils.substringAfter("(", queries.get(0).trim());
    String actualDMLQuery = StringUtils.substringAfter("SELECT", queries.get(0).trim());
    String expectedDDLQuery = StringUtils.substringAfter("(", ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_nested.ddl"));
    String expectedDMLQuery = StringUtils.substringAfter("SELECT", ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_nested.dml"));
    Assert.assertEquals(actualDDLQuery, expectedDDLQuery);
    Assert.assertEquals(actualDMLQuery, expectedDMLQuery);
  }
}
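The inline ImmutableMap in this test doubles as the ConvertibleHiveDataset destination config: a destinationFormats entry plus <format>.destination.* keys for each format. As a short sketch, the same settings can be expressed as HOCON via com.typesafe.config.ConfigFactory, which may be easier to read; only the keys that appear in the test above are used, and this is not a complete list of what ConvertibleHiveDataset accepts.

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

final class DestinationConfigSketch {
  // Sketch only: the destination settings the test builds with ImmutableMap, written as HOCON.
  static Config nestedOrcDestinationConfig() {
    return ConfigFactory.parseString(
        "destinationFormats = nestedOrc\n"
            + "nestedOrc.destination.tableName = testtable_orc_nested\n"
            + "nestedOrc.destination.dbName = testdb\n"
            + "nestedOrc.destination.dataPath = \"file:/tmp/testtable_orc_nested\"\n");
  }
}

Since both forms carry identical key-value pairs, passing such a Config to ConvertibleHiveDatasetTest.createTestConvertibleDataset should yield the same dataset as the map-built version above.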
Use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.
The class HiveSource, method createWorkunitsForPartitionedTable.
protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
  boolean setLineageInfo = false;
  long tableProcessTime = new DateTime().getMillis();
  this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
  Optional<String> partitionFilter = Optional.absent();
  // If the table is date partitioned, use the partition name to filter partitions older than lookback
  if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
    partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
    log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
  }
  List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
  for (Partition sourcePartition : sourcePartitions) {
    if (isOlderThanLookback(sourcePartition)) {
      continue;
    }
    LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
    try {
      if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
        log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
        continue;
      }
      long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
      if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
        log.debug(String.format("Processing partition: %s", sourcePartition));
        long partitionProcessTime = new DateTime().getMillis();
        this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
        LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
        HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
        hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
        EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
        if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
          setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
          log.info("Added lineage event for dataset " + hiveDataset.getUrn());
          // Add lineage information only once per hive table
          setLineageInfo = true;
        }
        workunits.add(hiveWorkUnit);
        log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      } else {
        // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
        log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      }
    } catch (UpdateNotFoundException e) {
      log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
    } catch (SchemaNotFoundException e) {
      log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
    } catch (UncheckedExecutionException e) {
      log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
    }
  }
}
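The lookback partition filter above is opt-in: it is built only when the dataset's properties contain all three LookbackPartitionFilterGenerator keys checked by the containsKey conditions. Below is a minimal sketch of such a property set; the constants are the ones referenced in the method, while the example values ("datepartition", "yyyy-MM-dd-HH", "3d") are illustrative assumptions rather than documented defaults.

import java.util.Properties;

final class LookbackFilterPropsSketch {
  // Sketch only: dataset properties that would satisfy the three containsKey checks above.
  // Values are illustrative; consult the Gobblin docs for the accepted formats.
  static Properties lookbackFilterProperties() {
    Properties props = new Properties();
    props.setProperty(LookbackPartitionFilterGenerator.PARTITION_COLUMN, "datepartition");
    props.setProperty(LookbackPartitionFilterGenerator.DATETIME_FORMAT, "yyyy-MM-dd-HH");
    props.setProperty(LookbackPartitionFilterGenerator.LOOKBACK, "3d");
    return props;
  }
}

When all three keys are present, createWorkunitsForPartitionedTable wraps the properties in a LookbackPartitionFilterGenerator and passes the resulting filter string to HiveUtils.getPartitions, so older partitions are never fetched from the metastore in the first place.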
Use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.
The class HiveAvroToOrcConverterTest, method testFlattenSchemaDDLandDML.
/**
 * Test flattened DDL and DML generation
 * @throws Exception
 */
@Test
public void testFlattenSchemaDDLandDML() throws Exception {
  String dbName = "testdb";
  String tableName = "testtable";
  String tableSdLoc = "/tmp/testtable";
  this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);
  Table table = this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName, tableSdLoc, Optional.<String>absent());
  Schema schema = ConversionHiveTestUtils.readSchemaFromJsonFile(resourceDir, "recordWithinRecordWithinRecord_nested.json");
  WorkUnitState wus = ConversionHiveTestUtils.createWus(dbName, tableName, 0);
  try (HiveAvroToFlattenedOrcConverter converter = new HiveAvroToFlattenedOrcConverter()) {
    Config config = ConfigFactory.parseMap(ImmutableMap.<String, String>builder()
        .put("destinationFormats", "flattenedOrc")
        .put("flattenedOrc.destination.dbName", dbName)
        .put("flattenedOrc.destination.tableName", tableName + "_orc")
        .put("flattenedOrc.destination.dataPath", "file:" + tableSdLoc + "_orc")
        .build());
    ConvertibleHiveDataset cd = ConvertibleHiveDatasetTest.createTestConvertibleDataset(config);
    List<QueryBasedHiveConversionEntity> conversionEntities = Lists.newArrayList(converter.convertRecord(converter.convertSchema(schema, wus), new QueryBasedHiveConversionEntity(cd, new SchemaAwareHiveTable(table, schema)), wus));
    Assert.assertEquals(conversionEntities.size(), 1, "Only one query entity should be returned");
    QueryBasedHiveConversionEntity queryBasedHiveConversionEntity = conversionEntities.get(0);
    List<String> queries = queryBasedHiveConversionEntity.getQueries();
    Assert.assertEquals(queries.size(), 4, "4 DDL and one DML query should be returned");
    // Ignoring part before first bracket in DDL and 'select' clause in DML because staging table has
    // .. a random name component
    String actualDDLQuery = StringUtils.substringAfter("(", queries.get(0).trim());
    String actualDMLQuery = StringUtils.substringAfter("SELECT", queries.get(0).trim());
    String expectedDDLQuery = StringUtils.substringAfter("(", ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_flattened.ddl"));
    String expectedDMLQuery = StringUtils.substringAfter("SELECT", ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_flattened.dml"));
    Assert.assertEquals(actualDDLQuery, expectedDDLQuery);
    Assert.assertEquals(actualDMLQuery, expectedDMLQuery);
  }
}