
Example 1 with SchemaNotFoundException

Use of org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException in the apache/incubator-gobblin project.

From the class HiveSource, method createWorkunitForNonPartitionedTable:

protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
    // Create workunits for tables
    try {
        long tableProcessTime = new DateTime().getMillis();
        long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
        this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
        LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
        if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
            log.info(String.format("Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s", hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(), this.ignoreDataPathIdentifierList));
            return;
        }
        if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
            log.info(String.format("Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
            HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
            LongWatermark expectedDatasetHighWatermark = this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
            hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
            EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
            if (hiveDataset instanceof ConvertibleHiveDataset) {
                setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                log.info("Added lineage event for dataset " + hiveDataset.getUrn());
            }
            this.workunits.add(hiveWorkUnit);
            log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
        } else {
            log.info(String.format("Not creating workunit for table %s as updateTime %s and createTime %s is not greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
        }
    } catch (UpdateNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as update time was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    } catch (SchemaNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as schema was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    }
}
Also used:
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)
UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException)
ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset)
SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException)
DateTime (org.joda.time.DateTime)
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
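
Both branches above delegate the actual watermark check to shouldCreateWorkunit, whose body is not part of this example. Going only by the log messages in the method, a minimal sketch of that check might look like the following; the body below is an illustrative assumption, not the project's actual implementation.

// Hypothetical sketch of shouldCreateWorkunit, inferred from the log messages
// above; the real HiveSource implementation may differ. Assumes the same
// updateProvider field used in the surrounding class, and that update time,
// create time, and watermark values are all compared in milliseconds.
protected boolean shouldCreateWorkunit(Table table, LongWatermark lowWatermark) throws IOException, UpdateNotFoundException {
    long updateTime = this.updateProvider.getUpdateTime(table);
    // Hive's thrift metadata records the create time in seconds; convert to millis.
    long createTime = TimeUnit.SECONDS.toMillis(table.getTTable().getCreateTime());
    // Create a workunit if either timestamp has advanced past the low watermark.
    return Math.max(updateTime, createTime) > lowWatermark.getValue();
}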

Example 2 with SchemaNotFoundException

Use of org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException in the apache/incubator-gobblin project.

From the class HiveSource, method createWorkunitsForPartitionedTable:

protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    boolean setLineageInfo = false;
    long tableProcessTime = new DateTime().getMillis();
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    Optional<String> partitionFilter = Optional.absent();
    // If the table is date partitioned, use the partition name to filter partitions older than lookback
    if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
        partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
        log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
    }
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
    for (Partition sourcePartition : sourcePartitions) {
        if (isOlderThanLookback(sourcePartition)) {
            continue;
        }
        LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
        try {
            if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
                log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
                continue;
            }
            long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
            if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
                log.debug(String.format("Processing partition: %s", sourcePartition));
                long partitionProcessTime = new DateTime().getMillis();
                this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
                LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
                HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
                hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
                EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
                if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
                    setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                    log.info("Added lineage event for dataset " + hiveDataset.getUrn());
                    // Add lineage information only once per hive table
                    setLineageInfo = true;
                }
                workunits.add(hiveWorkUnit);
                log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            } else {
                // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
                log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            }
        } catch (UpdateNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (SchemaNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (UncheckedExecutionException e) {
            log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
        }
    }
}
Also used:
LookbackPartitionFilterGenerator (org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator)
Path (org.apache.hadoop.fs.Path)
UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException)
Partition (org.apache.hadoop.hive.ql.metadata.Partition)
UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException)
ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset)
DateTime (org.joda.time.DateTime)
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)
SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException)
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
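
The partition loop above also relies on isOlderThanLookback to skip stale partitions, and that method is not shown either. Under the assumption that it compares each partition's create time against a configured lookback window, a minimal sketch could be the following; the maxLookBackDays field is a hypothetical name for illustration.

// Hypothetical sketch of isOlderThanLookback; maxLookBackDays is an assumed
// field name, and the real HiveSource may derive the cutoff differently.
protected boolean isOlderThanLookback(Partition partition) {
    // Anything created before this cutoff falls outside the lookback window.
    long cutoffMillis = new DateTime().minusDays(this.maxLookBackDays).getMillis();
    // Thrift partition metadata records the create time in seconds.
    long createMillis = TimeUnit.SECONDS.toMillis(partition.getTPartition().getCreateTime());
    return createMillis < cutoffMillis;
}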

Aggregations

SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException): 2 usages
ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset): 2 usages
UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException): 2 usages
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 2 usages
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 2 usages
DateTime (org.joda.time.DateTime): 2 usages
UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException): 1 usage
LookbackPartitionFilterGenerator (org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator): 1 usage
Path (org.apache.hadoop.fs.Path): 1 usage
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 1 usage