
Example 1 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

From class HiveSource, method createWorkunitForNonPartitionedTable:

protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
    // Create workunits for tables
    try {
        long tableProcessTime = new DateTime().getMillis();
        long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
        this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
        LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
        if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
            log.info(String.format("Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s", hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(), this.ignoreDataPathIdentifierList));
            return;
        }
        if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
            log.info(String.format("Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
            HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
            LongWatermark expectedDatasetHighWatermark = this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
            hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
            EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
            if (hiveDataset instanceof ConvertibleHiveDataset) {
                setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                log.info("Added lineage event for dataset " + hiveDataset.getUrn());
            }
            this.workunits.add(hiveWorkUnit);
            log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
        } else {
            log.info(String.format("Not creating workunit for table %s as updateTime %s and createTime %s is not greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
        }
    } catch (UpdateNotFoundException e) {
        log.error(String.format("Not creating workunit for %s as update time was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    } catch (SchemaNotFoundException e) {
        log.error(String.format("Not creating workunit for %s as schema was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    }
}
Also used: WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException), DateTime (org.joda.time.DateTime), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
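
The shouldCreateWorkunit gate above reduces to comparing the table's update and create times against the previously committed high watermark. A minimal sketch of that comparison, assuming epoch-millisecond timestamps; this is an illustrative simplification, not the actual HiveSource implementation:

import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class WatermarkGate {

    // Illustrative low-watermark check: a workunit is warranted only when the
    // table changed (updateTime) or was recreated (createTime) after the last
    // committed high watermark.
    public static boolean shouldCreateWorkunit(long updateTimeMillis, long createTimeMillis, LongWatermark lowWatermark) {
        return Math.max(updateTimeMillis, createTimeMillis) > lowWatermark.getValue();
    }
}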

Example 2 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

From class ValidationJob, method processPartitionedTable:

/**
 * Validate all {@link Partition}s for a {@link Table} if it was updated recently, by checking whether its update time
 * lies within the maxLookBackTime and skipRecentThanTime window.
 * @param hiveDataset {@link HiveDataset} containing {@link Table} and {@link Partition} info.
 * @param client {@link IMetaStoreClient} to query Hive.
 * @throws IOException Issue in validating {@link HiveDataset}
 */
private void processPartitionedTable(ConvertibleHiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    // Get partitions for the table
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent());
    for (final String format : hiveDataset.getDestFormats()) {
        Optional<ConvertibleHiveDataset.ConversionConfig> conversionConfigOptional = hiveDataset.getConversionConfigForFormat(format);
        if (conversionConfigOptional.isPresent()) {
            // Get conversion config
            ConvertibleHiveDataset.ConversionConfig conversionConfig = conversionConfigOptional.get();
            String orcTableName = conversionConfig.getDestinationTableName();
            String orcTableDatabase = conversionConfig.getDestinationDbName();
            Pair<Optional<org.apache.hadoop.hive.metastore.api.Table>, Optional<List<Partition>>> destinationMeta = HiveConverterUtils.getDestinationTableMeta(orcTableDatabase, orcTableName, this.props);
            // Validate each partition
            for (final Partition sourcePartition : sourcePartitions) {
                try {
                    final long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
                    if (shouldValidate(sourcePartition)) {
                        log.info(String.format("Validating partition: %s", sourcePartition.getCompleteName()));
                        // Generate validation queries
                        final List<String> countValidationQueries = HiveValidationQueryGenerator.generateCountValidationQueries(hiveDataset, Optional.of(sourcePartition), conversionConfig);
                        final List<String> dataValidationQueries = Lists.newArrayList(HiveValidationQueryGenerator.generateDataValidationQuery(hiveDataset.getTable().getTableName(), hiveDataset.getTable().getDbName(), destinationMeta.getKey().get(), Optional.of(sourcePartition), this.isNestedORC));
                        this.futures.add(this.exec.submit(new Callable<Void>() {

                            @Override
                            public Void call() throws Exception {
                                // Execute validation queries
                                log.debug(String.format("Going to execute count validation queries queries: %s for format: %s " + "and partition %s", countValidationQueries, format, sourcePartition.getCompleteName()));
                                List<Long> rowCounts = ValidationJob.this.getValidationOutputFromHive(countValidationQueries);
                                log.debug(String.format("Going to execute data validation queries: %s for format: %s and partition %s", dataValidationQueries, format, sourcePartition.getCompleteName()));
                                List<Long> rowDataValidatedCount = ValidationJob.this.getValidationOutputFromHive(dataValidationQueries);
                                // Validate and populate report
                                validateAndPopulateReport(sourcePartition.getCompleteName(), updateTime, rowCounts, rowDataValidatedCount);
                                return null;
                            }
                        }));
                    } else {
                        log.debug(String.format("Not validating partition: %s as updateTime: %s is not in range of max look back: %s " + "and skip recent than: %s", sourcePartition.getCompleteName(), updateTime, this.maxLookBackTime, this.skipRecentThanTime));
                    }
                } catch (UncheckedExecutionException e) {
                    log.warn(String.format("Not validating partition: %s %s", sourcePartition.getCompleteName(), e.getMessage()));
                } catch (UpdateNotFoundException e) {
                    log.warn(String.format("Not validating partition: %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
                }
            }
        } else {
            log.info(String.format("No conversion config found for format %s. Ignoring data validation", format));
        }
    }
}
Also used: UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), Partition (org.apache.hadoop.hive.ql.metadata.Partition), UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException), Optional (com.google.common.base.Optional), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), Callable (java.util.concurrent.Callable)
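
Per the log messages, the shouldValidate check is a window test on the partition's update time: old enough to fall past skipRecentThanTime, recent enough to fall within maxLookBackTime. A hedged sketch of that logic with millisecond bounds; the real ValidationJob fields may be Joda-Time objects rather than longs:

public class ValidationWindow {

    private final long maxLookBackTime;     // oldest update time still validated
    private final long skipRecentThanTime;  // updates newer than this are skipped

    public ValidationWindow(long maxLookBackTime, long skipRecentThanTime) {
        this.maxLookBackTime = maxLookBackTime;
        this.skipRecentThanTime = skipRecentThanTime;
    }

    // A partition qualifies when its update time lies inside the validation window.
    public boolean shouldValidate(long updateTimeMillis) {
        return updateTimeMillis >= this.maxLookBackTime && updateTimeMillis <= this.skipRecentThanTime;
    }
}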

Example 3 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

From class HiveAvroToOrcConverterTest, method testNestedSchemaDDLandDML:

/**
 * Test nested DDL and DML generation
 * @throws Exception
 */
@Test
public void testNestedSchemaDDLandDML() throws Exception {
    String dbName = "testdb";
    String tableName = "testtable";
    String tableSdLoc = "/tmp/testtable";
    this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);
    Table table = this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName, tableSdLoc, Optional.<String>absent());
    Schema schema = ConversionHiveTestUtils.readSchemaFromJsonFile(resourceDir, "recordWithinRecordWithinRecord_nested.json");
    WorkUnitState wus = ConversionHiveTestUtils.createWus(dbName, tableName, 0);
    wus.getJobState().setProp("orc.table.flatten.schema", "false");
    try (HiveAvroToNestedOrcConverter converter = new HiveAvroToNestedOrcConverter()) {
        Config config = ConfigFactory.parseMap(ImmutableMap.<String, String>builder().put("destinationFormats", "nestedOrc").put("nestedOrc.destination.tableName", "testtable_orc_nested").put("nestedOrc.destination.dbName", dbName).put("nestedOrc.destination.dataPath", "file:/tmp/testtable_orc_nested").build());
        ConvertibleHiveDataset cd = ConvertibleHiveDatasetTest.createTestConvertibleDataset(config);
        List<QueryBasedHiveConversionEntity> conversionEntities = Lists.newArrayList(converter.convertRecord(converter.convertSchema(schema, wus), new QueryBasedHiveConversionEntity(cd, new SchemaAwareHiveTable(table, schema)), wus));
        Assert.assertEquals(conversionEntities.size(), 1, "Only one query entity should be returned");
        QueryBasedHiveConversionEntity queryBasedHiveConversionEntity = conversionEntities.get(0);
        List<String> queries = queryBasedHiveConversionEntity.getQueries();
        Assert.assertEquals(queries.size(), 4, "4 queries (DDL statements plus one DML) should be returned");
        // Ignore everything before the first '(' in the DDL and before SELECT in the DML,
        // because the staging table name contains a random component
        String actualDDLQuery = StringUtils.substringAfter(queries.get(0).trim(), "(");
        // The DML is assumed to be the last query in the list
        String actualDMLQuery = StringUtils.substringAfter(queries.get(queries.size() - 1).trim(), "SELECT");
        String expectedDDLQuery = StringUtils.substringAfter(ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_nested.ddl"), "(");
        String expectedDMLQuery = StringUtils.substringAfter(ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_nested.dml"), "SELECT");
        Assert.assertEquals(actualDDLQuery, expectedDDLQuery);
        Assert.assertEquals(actualDMLQuery, expectedDMLQuery);
    }
}
Also used: Table (org.apache.hadoop.hive.metastore.api.Table), SchemaAwareHiveTable (org.apache.gobblin.data.management.conversion.hive.entities.SchemaAwareHiveTable), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Config (com.typesafe.config.Config), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), Schema (org.apache.avro.Schema), FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema), QueryBasedHiveConversionEntity (org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHiveConversionEntity), ConvertibleHiveDatasetTest (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetTest), Test (org.testng.annotations.Test)
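
Note that StringUtils.substringAfter(String str, String separator) takes the string under inspection first, which is why the calls above were reordered. A quick standalone illustration with Apache Commons Lang 3:

import org.apache.commons.lang3.StringUtils;

public class SubstringAfterDemo {

    public static void main(String[] args) {
        String ddl = "CREATE TABLE staging_abc123 (id int)";
        // Correct order: everything after the first '(' in the DDL
        System.out.println(StringUtils.substringAfter(ddl, "("));   // prints "id int)"
        // Swapped arguments search inside "(" and find nothing, returning ""
        System.out.println(StringUtils.substringAfter("(", ddl));   // prints ""
    }
}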

Example 4 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

From class HiveSource, method createWorkunitsForPartitionedTable:

protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    boolean setLineageInfo = false;
    long tableProcessTime = new DateTime().getMillis();
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    Optional<String> partitionFilter = Optional.absent();
    // If the table is date partitioned, use the partition name to filter partitions older than lookback
    if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
        partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
        log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
    }
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
    for (Partition sourcePartition : sourcePartitions) {
        if (isOlderThanLookback(sourcePartition)) {
            continue;
        }
        LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
        try {
            if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
                log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
                continue;
            }
            long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
            if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
                log.debug(String.format("Processing partition: %s", sourcePartition));
                long partitionProcessTime = new DateTime().getMillis();
                this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
                LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
                HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
                hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
                EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
                if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
                    setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                    log.info("Added lineage event for dataset " + hiveDataset.getUrn());
                    // Add lineage information only once per hive table
                    setLineageInfo = true;
                }
                workunits.add(hiveWorkUnit);
                log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            } else {
                // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
                log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            }
        } catch (UpdateNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (SchemaNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (UncheckedExecutionException e) {
            log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
        }
    }
}
Also used: LookbackPartitionFilterGenerator (org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator), Path (org.apache.hadoop.fs.Path), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), Partition (org.apache.hadoop.hive.ql.metadata.Partition), UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), DateTime (org.joda.time.DateTime), WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
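
The partition filter above is built only when all three lookback properties are present on the dataset. A minimal helper capturing that gating, reusing the constant names and calls from the method itself; the wrapper class is hypothetical:

import java.util.Properties;

import com.google.common.base.Optional;

import org.apache.gobblin.data.management.copy.hive.HiveDataset;
import org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator;

public class PartitionFilterHelper {

    // Returns a lookback partition filter only when the dataset opts in via all
    // three generator properties; otherwise all partitions are listed.
    public static Optional<String> lookbackFilter(HiveDataset dataset) {
        Properties props = dataset.getProperties();
        if (props.containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN)
                && props.containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT)
                && props.containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
            return Optional.of(new LookbackPartitionFilterGenerator(props).getFilter(dataset));
        }
        return Optional.absent();
    }
}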

Example 5 with ConvertibleHiveDataset

use of org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset in project incubator-gobblin by apache.

From class HiveAvroToOrcConverterTest, method testFlattenSchemaDDLandDML:

/**
 * Test flattened DDL and DML generation
 * @throws Exception
 */
@Test
public void testFlattenSchemaDDLandDML() throws Exception {
    String dbName = "testdb";
    String tableName = "testtable";
    String tableSdLoc = "/tmp/testtable";
    this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);
    Table table = this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName, tableSdLoc, Optional.<String>absent());
    Schema schema = ConversionHiveTestUtils.readSchemaFromJsonFile(resourceDir, "recordWithinRecordWithinRecord_nested.json");
    WorkUnitState wus = ConversionHiveTestUtils.createWus(dbName, tableName, 0);
    try (HiveAvroToFlattenedOrcConverter converter = new HiveAvroToFlattenedOrcConverter()) {
        Config config = ConfigFactory.parseMap(ImmutableMap.<String, String>builder().put("destinationFormats", "flattenedOrc").put("flattenedOrc.destination.dbName", dbName).put("flattenedOrc.destination.tableName", tableName + "_orc").put("flattenedOrc.destination.dataPath", "file:" + tableSdLoc + "_orc").build());
        ConvertibleHiveDataset cd = ConvertibleHiveDatasetTest.createTestConvertibleDataset(config);
        List<QueryBasedHiveConversionEntity> conversionEntities = Lists.newArrayList(converter.convertRecord(converter.convertSchema(schema, wus), new QueryBasedHiveConversionEntity(cd, new SchemaAwareHiveTable(table, schema)), wus));
        Assert.assertEquals(conversionEntities.size(), 1, "Only one query entity should be returned");
        QueryBasedHiveConversionEntity queryBasedHiveConversionEntity = conversionEntities.get(0);
        List<String> queries = queryBasedHiveConversionEntity.getQueries();
        Assert.assertEquals(queries.size(), 4, "4 queries (DDL statements plus one DML) should be returned");
        // Ignore everything before the first '(' in the DDL and before SELECT in the DML,
        // because the staging table name contains a random component
        String actualDDLQuery = StringUtils.substringAfter(queries.get(0).trim(), "(");
        // The DML is assumed to be the last query in the list
        String actualDMLQuery = StringUtils.substringAfter(queries.get(queries.size() - 1).trim(), "SELECT");
        String expectedDDLQuery = StringUtils.substringAfter(ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_flattened.ddl"), "(");
        String expectedDMLQuery = StringUtils.substringAfter(ConversionHiveTestUtils.readQueryFromFile(resourceDir, "recordWithinRecordWithinRecord_flattened.dml"), "SELECT");
    }
}
Also used: Table (org.apache.hadoop.hive.metastore.api.Table), SchemaAwareHiveTable (org.apache.gobblin.data.management.conversion.hive.entities.SchemaAwareHiveTable), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Config (com.typesafe.config.Config), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), Schema (org.apache.avro.Schema), FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema), QueryBasedHiveConversionEntity (org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHiveConversionEntity), ConvertibleHiveDatasetTest (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetTest), Test (org.testng.annotations.Test)
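
Both converter tests build the same ConvertibleHiveDataset config shape; only the format prefix (flattenedOrc vs nestedOrc) and destination names differ. A combined sketch using the keys from the two tests; declaring both formats through one comma-separated destinationFormats value is an assumption about how that list is parsed:

import com.google.common.collect.ImmutableMap;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class ConversionConfigDemo {

    public static void main(String[] args) {
        // One dataset config declaring both destination formats; each format
        // prefix namespaces its own destination table, database, and data path.
        Config config = ConfigFactory.parseMap(ImmutableMap.<String, String>builder()
            .put("destinationFormats", "flattenedOrc,nestedOrc")  // comma-separated list is an assumption
            .put("flattenedOrc.destination.dbName", "testdb")
            .put("flattenedOrc.destination.tableName", "testtable_orc")
            .put("flattenedOrc.destination.dataPath", "file:/tmp/testtable_orc")
            .put("nestedOrc.destination.dbName", "testdb")
            .put("nestedOrc.destination.tableName", "testtable_orc_nested")
            .put("nestedOrc.destination.dataPath", "file:/tmp/testtable_orc_nested")
            .build());
        System.out.println(config.root().render());
    }
}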

Aggregations

ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset): 8
Config (com.typesafe.config.Config): 3
UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException): 3
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 3
Optional (com.google.common.base.Optional): 2
UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException): 2
IOException (java.io.IOException): 2
Schema (org.apache.avro.Schema): 2
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 2
SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException): 2
ConvertibleHiveDatasetTest (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetTest): 2
QueryBasedHiveConversionEntity (org.apache.gobblin.data.management.conversion.hive.entities.QueryBasedHiveConversionEntity): 2
SchemaAwareHiveTable (org.apache.gobblin.data.management.conversion.hive.entities.SchemaAwareHiveTable): 2
HiveDataset (org.apache.gobblin.data.management.copy.hive.HiveDataset): 2
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 2
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 2
Path (org.apache.hadoop.fs.Path): 2
IMetaStoreClient (org.apache.hadoop.hive.metastore.IMetaStoreClient): 2
AbstractJob (azkaban.jobExecutor.AbstractJob): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1