
Example 1 with UpdateNotFoundException

Use of org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException in project incubator-gobblin by apache.

From the class HiveSource, the method createWorkunitForNonPartitionedTable:

protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
    // Create workunits for tables
    try {
        long tableProcessTime = new DateTime().getMillis();
        long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
        this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
        LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
        if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
            log.info(String.format("Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s", hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(), this.ignoreDataPathIdentifierList));
            return;
        }
        if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
            log.info(String.format("Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
            HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
            LongWatermark expectedDatasetHighWatermark = this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
            hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
            EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
            if (hiveDataset instanceof ConvertibleHiveDataset) {
                setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                log.info("Added lineage event for dataset " + hiveDataset.getUrn());
            }
            this.workunits.add(hiveWorkUnit);
            log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
        } else {
            log.info(String.format("Not creating workunit for table %s as updateTime %s and createTime %s is not greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
        }
    } catch (UpdateNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as update time was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    } catch (SchemaNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as schema was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    }
}
Also used: WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException), DateTime (org.joda.time.DateTime), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
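
All three examples obtain update times from an injected update provider and treat UpdateNotFoundException as a signal to log and skip the unit. Below is a minimal sketch of such a provider, deriving the update time from the data location's modification time on the file system. The exact shape of the HiveUnitUpdateProvider interface and the UpdateNotFoundException constructor used here are assumptions inferred from the getUpdateTime(...) calls above, not confirmed API.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;

import org.apache.gobblin.data.management.conversion.hive.provider.HiveUnitUpdateProvider;
import org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException;

public class ModificationTimeUpdateProvider implements HiveUnitUpdateProvider {

    private final FileSystem fs;

    public ModificationTimeUpdateProvider(FileSystem fs) {
        this.fs = fs;
    }

    @Override
    public long getUpdateTime(Table table) throws UpdateNotFoundException {
        return getModificationTime(table.getPath(), table.getCompleteName());
    }

    @Override
    public long getUpdateTime(Partition partition) throws UpdateNotFoundException {
        return getModificationTime(new Path(partition.getLocation()), partition.getCompleteName());
    }

    private long getModificationTime(Path location, String name) throws UpdateNotFoundException {
        try {
            // Use the data directory's modification time (epoch millis) as the update time.
            return this.fs.getFileStatus(location).getModificationTime();
        } catch (IOException e) {
            // Callers such as HiveSource and ValidationJob catch this and skip the unit.
            // The (String) constructor is an assumption about UpdateNotFoundException.
            throw new UpdateNotFoundException(String.format("Unable to find update time for %s: %s", name, e.getMessage()));
        }
    }
}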

Example 2 with UpdateNotFoundException

Use of org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException in project incubator-gobblin by apache.

From the class ValidationJob, the method processPartitionedTable:

/**
 * Validate all {@link Partition}s for a {@link Table} if it was updated recently, by checking whether its update time
 * lies between maxLookBackTime and skipRecentThanTime.
 * @param hiveDataset {@link HiveDataset} containing {@link Table} and {@link Partition} info.
 * @param client {@link IMetaStoreClient} to query Hive.
 * @throws IOException if there is an issue validating the {@link HiveDataset}
 */
private void processPartitionedTable(ConvertibleHiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    // Get partitions for the table
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent());
    for (final String format : hiveDataset.getDestFormats()) {
        Optional<ConvertibleHiveDataset.ConversionConfig> conversionConfigOptional = hiveDataset.getConversionConfigForFormat(format);
        if (conversionConfigOptional.isPresent()) {
            // Get conversion config
            ConvertibleHiveDataset.ConversionConfig conversionConfig = conversionConfigOptional.get();
            String orcTableName = conversionConfig.getDestinationTableName();
            String orcTableDatabase = conversionConfig.getDestinationDbName();
            Pair<Optional<org.apache.hadoop.hive.metastore.api.Table>, Optional<List<Partition>>> destinationMeta = HiveConverterUtils.getDestinationTableMeta(orcTableDatabase, orcTableName, this.props);
            // Validate each partition
            for (final Partition sourcePartition : sourcePartitions) {
                try {
                    final long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
                    if (shouldValidate(sourcePartition)) {
                        log.info(String.format("Validating partition: %s", sourcePartition.getCompleteName()));
                        // Generate validation queries
                        final List<String> countValidationQueries = HiveValidationQueryGenerator.generateCountValidationQueries(hiveDataset, Optional.of(sourcePartition), conversionConfig);
                        final List<String> dataValidationQueries = Lists.newArrayList(HiveValidationQueryGenerator.generateDataValidationQuery(hiveDataset.getTable().getTableName(), hiveDataset.getTable().getDbName(), destinationMeta.getKey().get(), Optional.of(sourcePartition), this.isNestedORC));
                        this.futures.add(this.exec.submit(new Callable<Void>() {

                            @Override
                            public Void call() throws Exception {
                                // Execute validation queries
                                log.debug(String.format("Going to execute count validation queries queries: %s for format: %s " + "and partition %s", countValidationQueries, format, sourcePartition.getCompleteName()));
                                List<Long> rowCounts = ValidationJob.this.getValidationOutputFromHive(countValidationQueries);
                                log.debug(String.format("Going to execute data validation queries: %s for format: %s and partition %s", dataValidationQueries, format, sourcePartition.getCompleteName()));
                                List<Long> rowDataValidatedCount = ValidationJob.this.getValidationOutputFromHive(dataValidationQueries);
                                // Validate and populate report
                                validateAndPopulateReport(sourcePartition.getCompleteName(), updateTime, rowCounts, rowDataValidatedCount);
                                return null;
                            }
                        }));
                    } else {
                        log.debug(String.format("Not validating partition: %s as updateTime: %s is not in range of max look back: %s " + "and skip recent than: %s", sourcePartition.getCompleteName(), updateTime, this.maxLookBackTime, this.skipRecentThanTime));
                    }
                } catch (UncheckedExecutionException e) {
                    log.warn(String.format("Not validating partition: %s %s", sourcePartition.getCompleteName(), e.getMessage()));
                } catch (UpdateNotFoundException e) {
                    log.warn(String.format("Not validating partition: %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
                }
            }
        } else {
            log.info(String.format("No conversion config found for format %s. Ignoring data validation", format));
        }
    }
}
Also used: UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), Partition (org.apache.hadoop.hive.ql.metadata.Partition), UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException), Optional (com.google.common.base.Optional), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), Callable (java.util.concurrent.Callable)
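
The javadoc above describes the validation window: a partition is validated only when its update time lies between maxLookBackTime and skipRecentThanTime. Below is a minimal sketch of that window check, assuming Joda-Time DateTime bounds and millisecond update times as used in the example; the real shouldValidate in ValidationJob takes the Partition itself and may apply additional criteria this sketch omits.

import org.joda.time.DateTime;

final class ValidationWindow {

    // Oldest update time that is still validated.
    private final DateTime maxLookBackTime;
    // Updates newer than this are skipped, since they may still be in flight.
    private final DateTime skipRecentThanTime;

    ValidationWindow(DateTime maxLookBackTime, DateTime skipRecentThanTime) {
        this.maxLookBackTime = maxLookBackTime;
        this.skipRecentThanTime = skipRecentThanTime;
    }

    boolean shouldValidate(long updateTimeMillis) {
        // Validate only units old enough to be stable but recent enough
        // to be worth re-checking.
        return updateTimeMillis >= this.maxLookBackTime.getMillis()
                && updateTimeMillis <= this.skipRecentThanTime.getMillis();
    }
}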

Example 3 with UpdateNotFoundException

Use of org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException in project incubator-gobblin by apache.

From the class HiveSource, the method createWorkunitsForPartitionedTable:

protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    boolean setLineageInfo = false;
    long tableProcessTime = new DateTime().getMillis();
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    Optional<String> partitionFilter = Optional.absent();
    // If the table is date partitioned, use the partition name to filter partitions older than lookback
    if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
        partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
        log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
    }
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
    for (Partition sourcePartition : sourcePartitions) {
        if (isOlderThanLookback(sourcePartition)) {
            continue;
        }
        LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
        try {
            if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
                log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
                continue;
            }
            long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
            if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
                log.debug(String.format("Processing partition: %s", sourcePartition));
                long partitionProcessTime = new DateTime().getMillis();
                this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
                LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
                HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
                hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
                EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
                if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
                    setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                    log.info("Added lineage event for dataset " + hiveDataset.getUrn());
                    // Add lineage information only once per hive table
                    setLineageInfo = true;
                }
                workunits.add(hiveWorkUnit);
                log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            } else {
                // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
                log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            }
        } catch (UpdateNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (SchemaNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (UncheckedExecutionException e) {
            log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
        }
    }
}
Also used: LookbackPartitionFilterGenerator (org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator), Path (org.apache.hadoop.fs.Path), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), Partition (org.apache.hadoop.hive.ql.metadata.Partition), UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), DateTime (org.joda.time.DateTime), WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
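
The log messages in Examples 1 and 3 imply the watermark decision behind shouldCreateWorkunit: a workunit is created only when the unit's update time, or the table's create time, exceeds the previously committed low watermark. Below is a minimal sketch of that comparison in a hypothetical helper; the seconds-to-millis scaling is an assumption consistent with the Hive metastore storing create time in seconds while tableProcessTime above is DateTime millis.

import org.apache.gobblin.source.extractor.extract.LongWatermark;
import org.apache.hadoop.hive.ql.metadata.Table;

final class WatermarkDecision {

    static boolean shouldCreateWorkunit(Table table, long updateTimeMillis, LongWatermark lowWatermark) {
        // The metastore's create time is in seconds; scale to millis so it is
        // comparable with the millisecond watermark values used above.
        long createTimeMillis = table.getTTable().getCreateTime() * 1000L;
        // Create a workunit only if something changed after the last
        // committed high watermark for this unit.
        return Math.max(updateTimeMillis, createTimeMillis) > lowWatermark.getValue();
    }
}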

Aggregations

ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset): 3 usages
UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException): 3 usages
UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException): 2 usages
SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException): 2 usages
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 2 usages
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 2 usages
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 2 usages
DateTime (org.joda.time.DateTime): 2 usages
Optional (com.google.common.base.Optional): 1 usage
Callable (java.util.concurrent.Callable): 1 usage
LookbackPartitionFilterGenerator (org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator): 1 usage
Path (org.apache.hadoop.fs.Path): 1 usage