Search in sources:

Example 1 with PartitionSpec

Use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.

The class AbstractMergeTable, method onTrigger.

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    final ComponentLog logger = getLog();
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final String blockingValue = context.getProperty(BLOCKING_KEY).evaluateAttributeExpressions(flowFile).getValue();
    String flowFileId = flowFile.getAttribute(CoreAttributes.UUID.key());
    boolean block = false;
    if (blocking && blockingCache.putIfAbsent(blockingValue, flowFileId) != null) {
        if (StringUtils.isBlank(flowFile.getAttribute(BLOCKED_START_TIME))) {
            flowFile = session.putAttribute(flowFile, BLOCKED_START_TIME, String.valueOf(System.currentTimeMillis()));
            getLogger().info("Transferring Flow file {} to blocked relationship", new Object[] { flowFile });
        }
        // penalize the flow file and transfer to BLOCKED
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_BLOCKED);
        return;
    }
    // Add Blocking time to flow file if this was a blocked flowfile.
    if (blocking && StringUtils.isNotBlank(flowFile.getAttribute(BLOCKED_START_TIME))) {
        String blockedStartTime = flowFile.getAttribute(BLOCKED_START_TIME);
        try {
            Long l = Long.parseLong(blockedStartTime);
            Long blockTime = System.currentTimeMillis() - l;
            getLogger().info("Processing Blocked flow file {}.  This was blocked for {} ms", new Object[] { flowFile, blockTime });
            flowFile = session.putAttribute(flowFile, BLOCKED_TIME, String.valueOf(blockTime) + " ms");
        } catch (NumberFormatException e) {
            // BLOCKED_START_TIME attribute was not a valid number; skip recording the blocked duration
        }
    }
    String PROVENANCE_EXECUTION_STATUS_KEY = context.getName() + " Execution Status";
    String partitionSpecString = context.getProperty(PARTITION_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue();
    String sourceSchema = context.getProperty(SOURCE_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
    String sourceTable = context.getProperty(SOURCE_TABLE).evaluateAttributeExpressions(flowFile).getValue();
    String targetSchema = context.getProperty(TARGET_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
    String targetTable = context.getProperty(TARGET_TABLE).evaluateAttributeExpressions(flowFile).getValue();
    String feedPartitionValue = context.getProperty(FEED_PARTITION).evaluateAttributeExpressions(flowFile).getValue();
    String mergeStrategyValue = context.getProperty(MERGE_STRATEGY).evaluateAttributeExpressions(flowFile).getValue();
    String hiveConfigurations = context.getProperty(HIVE_CONFIGURATIONS).evaluateAttributeExpressions(flowFile).getValue();
    boolean resetHive = context.getProperty(RESET_HIVE).asBoolean();
    final ColumnSpec[] columnSpecs = Optional.ofNullable(context.getProperty(FIELD_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue()).filter(StringUtils::isNotEmpty).map(ColumnSpec::createFromString).orElse(new ColumnSpec[0]);
    if (STRATEGY_PK_MERGE.equals(mergeStrategyValue) && (columnSpecs == null || columnSpecs.length == 0)) {
        getLog().error("Missing required field specification for PK merge feature");
        flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Failed: Missing required field specification for PK merge feature");
        release(blockingValue);
        session.transfer(flowFile, IngestProperties.REL_FAILURE);
        return;
    }
    // Maintain default for backward compatibility
    if (StringUtils.isEmpty(mergeStrategyValue)) {
        mergeStrategyValue = STRATEGY_DEDUPE_MERGE;
    }
    logger.info("Merge strategy: " + mergeStrategyValue + " Using Source: " + sourceTable + " Target: " + targetTable + " feed partition:" + feedPartitionValue + " partSpec: " + partitionSpecString);
    final StopWatch stopWatch = new StopWatch(true);
    try (final Connection conn = getConnection(context)) {
        TableMergeSyncSupport mergeSupport = new TableMergeSyncSupport(conn);
        if (resetHive) {
            mergeSupport.resetHiveConf();
        }
        mergeSupport.enableDynamicPartitions();
        if (StringUtils.isNotEmpty(hiveConfigurations)) {
            mergeSupport.setHiveConf(hiveConfigurations.split("\\|"));
        }
        PartitionSpec partitionSpec = new PartitionSpec(partitionSpecString);
        if (STRATEGY_DEDUPE_MERGE.equals(mergeStrategyValue)) {
            mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, true);
        } else if (STRATEGY_MERGE.equals(mergeStrategyValue)) {
            mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, false);
        } else if (STRATEGY_SYNC.equals(mergeStrategyValue)) {
            mergeSupport.doSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue);
        } else if (STRATEGY_ROLLING_SYNC.equals(mergeStrategyValue)) {
            mergeSupport.doRollingSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue);
        } else if (STRATEGY_PK_MERGE.equals(mergeStrategyValue)) {
            mergeSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, columnSpecs);
        } else {
            throw new UnsupportedOperationException("Failed to resolve the merge strategy");
        }
        session.getProvenanceReporter().modifyContent(flowFile, "Execution completed", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
        flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Successful");
        release(blockingValue);
        logger.info("Execution completed: " + stopWatch.getElapsed(TimeUnit.MILLISECONDS) + " Merge strategy: " + mergeStrategyValue + " Using Source: " + sourceTable + " Target: " + targetTable + " feed partition:" + feedPartitionValue + " partSpec: " + partitionSpecString);
        session.transfer(flowFile, REL_SUCCESS);
    } catch (final Exception e) {
        logger.error("Unable to execute merge doMerge for {} due to {}; routing to failure", new Object[] { flowFile, e }, e);
        flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Failed: " + e.getMessage());
        release(blockingValue);
        session.transfer(flowFile, REL_FAILURE);
    }
}
Also used: FlowFile(org.apache.nifi.flowfile.FlowFile), ColumnSpec(com.thinkbiganalytics.util.ColumnSpec), Connection(java.sql.Connection), ComponentLog(org.apache.nifi.logging.ComponentLog), PartitionSpec(com.thinkbiganalytics.util.PartitionSpec), ProcessException(org.apache.nifi.processor.exception.ProcessException), StopWatch(org.apache.nifi.util.StopWatch), StringUtils(org.apache.commons.lang3.StringUtils), TableMergeSyncSupport(com.thinkbiganalytics.ingest.TableMergeSyncSupport)
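
The blocking path above hinges on a shared, process-wide cache keyed by the BLOCKING_KEY value: putIfAbsent acts as the lock, and release(blockingValue) is called on every exit path (success, failure, exception) so that later flow files with the same key can proceed. A minimal sketch of that gate, assuming a ConcurrentHashMap-backed cache and a simple release helper (the field name and helper are assumptions, not Kylo's exact code):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Sketch only: Kylo's actual cache field and release() implementation may differ.
class BlockingGate {

    private final ConcurrentMap<String, String> blockingCache = new ConcurrentHashMap<>();

    /** Returns true for the first flow file with this key; later callers should be routed to BLOCKED. */
    boolean tryAcquire(String blockingValue, String flowFileId) {
        return blockingCache.putIfAbsent(blockingValue, flowFileId) == null;
    }

    /** Must run on success, failure, and exception paths so queued flow files are not stuck. */
    void release(String blockingValue) {
        blockingCache.remove(blockingValue);
    }
}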

Example 2 with PartitionSpec

Use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.

The class TableMergeSyncSupport, method createPartitionBatchesforPKMerge.

/**
 * Finds all partitions that contain matching keys.
 *
 * @param spec               the partition spec
 * @param sourceSchema       the name of the source table schema or database
 * @param sourceTable        the source table
 * @param targetSchema       the name of the target table schema or database
 * @param targetTable        the target table
 * @param feedPartitionValue the partition of the source table to use
 * @param joinOnClause       the JOIN clause for the source and target tables
 * @return the matching partitions
 */
protected List<PartitionBatch> createPartitionBatchesforPKMerge(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final String joinOnClause) {
    List<PartitionBatch> v;
    PartitionSpec aliasSpecA = spec.newForAlias("a");
    // Find all partitions that contain matching keys
    String sql = "select " + aliasSpecA.toPartitionSelectSQL() + ", count(0)" + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " a join " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " b" + " on " + joinOnClause + " where b.processing_dttm = '" + feedPartitionValue + "'" + " group by " + aliasSpecA.toPartitionSelectSQL();
    try (final Statement st = conn.createStatement()) {
        logger.info("Selecting target partitions query [" + sql + "]");
        ResultSet rs = doSelectSQL(st, sql);
        v = toPartitionBatches(spec, rs);
    } catch (SQLException e) {
        logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
        throw new RuntimeException("Failed to select partition batches", e);
    }
    return v;
}
Also used: SQLException(java.sql.SQLException), PartitionBatch(com.thinkbiganalytics.util.PartitionBatch), Statement(java.sql.Statement), ResultSet(java.sql.ResultSet), PartitionSpec(com.thinkbiganalytics.util.PartitionSpec)
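
To make the generated statement concrete: assuming a two-column partition spec (country, year), a primary-key join on id, and illustrative schema and table names, the SQL assembled above would look roughly like the string below. The exact aliasing and back-tick quoting come from PartitionSpec.toPartitionSelectSQL and HiveUtils.quoteIdentifier, so treat this purely as a sketch of the shape:

// Illustrative only; identifiers, join clause, and partition value are assumed for the example.
String illustrativeSql =
    "select a.`country`, a.`year`, count(0)" +
    " from `emp_sr`.`employee` a join `emp_sr`.`employee_valid` b" +
    " on a.id = b.id" +
    " where b.processing_dttm = '20160119074340'" +
    " group by a.`country`, a.`year`";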

Example 3 with PartitionSpec

Use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.

The class TableMergeSyncSupport, method generatePKMergePartitionQuery.

/**
 * Generates a query to merge two partitioned tables on a primary key.
 *
 * @param selectFields       the list of fields in the select clause of the source table
 * @param partitionSpec      partition specification
 * @param sourceSchema       the name of the source table schema or database
 * @param sourceTable        the source table
 * @param targetSchema       the name of the target table schema or database
 * @param targetTable        the target table
 * @param feedPartitionValue the partition of the source table to use
 * @param columnSpecs        the column specifications
 * @return the sql
 */
protected String generatePKMergePartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec partitionSpec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final ColumnSpec[] columnSpecs) {
    // Include alias
    String selectSQL = StringUtils.join(selectFields, ",");
    String[] selectFieldsWithAlias = selectFieldsForAlias(selectFields, "a");
    String selectSQLWithAlias = StringUtils.join(selectFieldsWithAlias, ",");
    String joinOnClause = ColumnSpec.toPrimaryKeyJoinSQL(columnSpecs, "a", "b");
    String[] primaryKeys = ColumnSpec.toPrimaryKeys(columnSpecs);
    PartitionSpec partitionSpecWithAlias = partitionSpec.newForAlias("a");
    String anyPK = primaryKeys[0];
    List<PartitionBatch> batches = createPartitionBatchesforPKMerge(partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue, joinOnClause);
    String targetPartitionWhereClause = targetPartitionsWhereClause(PartitionBatch.toPartitionBatchesForAlias(batches, "a"), false);
    // TODO: If the records matching the primary key between the source and target are in a different partition
    // AND the matching records are the only remaining records of the partition, then the following sql will fail to overwrite the
    // remaining record.  We need to detect this and then delete partition? This is a complex scenario..
    String sbSourceQuery = "select " + selectSQL + "," + partitionSpec.toDynamicSelectSQLSpec() + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
    // First finds all records in the valid (source) table for the feed partition
    // Second finds all records in the target that should be preserved for impacted partitions
    StringBuilder sb = new StringBuilder();
    sb.append("insert overwrite table ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ").append(partitionSpec.toDynamicPartitionSpec()).append("select ").append(selectSQL).append(",").append(partitionSpec.toPartitionSelectSQL()).append(" from (").append("  select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec()).append("  from ").append(HiveUtils.quoteIdentifier(sourceSchema, sourceTable)).append(" a").append("  where ").append("  a.processing_dttm = ").append(HiveUtils.quoteString(feedPartitionValue)).append(" union all ").append("  select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec()).append("  from ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" a left outer join (").append(sbSourceQuery).append(") b ").append("  on (").append(joinOnClause).append(")").append("  where ").append("  (b.").append(anyPK).append(" is null)");
    if (targetPartitionWhereClause != null) {
        sb.append(" and (").append(targetPartitionWhereClause).append(")");
    }
    sb.append(") t");
    return sb.toString();
}
Also used: PartitionBatch(com.thinkbiganalytics.util.PartitionBatch), PartitionSpec(com.thinkbiganalytics.util.PartitionSpec)
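
Unwinding the StringBuilder, the statement this method produces has roughly the shape sketched below: a dynamic-partition insert overwrite whose union keeps every source record for the feed partition plus every target record, in the impacted partitions only, that has no matching primary key in the source. Column and partition names are placeholders for illustration, not the output of the actual helpers:

// Rough shape only; real identifiers come from PartitionSpec, ColumnSpec and HiveUtils.
//
//   insert overwrite table `target_schema`.`target_table` partition (country, year)
//   select <fields>, country, year from (
//       select a.<fields>, a.country, a.year
//         from `source_schema`.`source_table` a
//        where a.processing_dttm = '<feedPartitionValue>'
//       union all
//       select a.<fields>, a.country, a.year
//         from `target_schema`.`target_table` a
//         left outer join (<source query>) b on (a.id = b.id)
//        where (b.id is null) and (<target partition where clause>)
//   ) t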

Example 4 with PartitionSpec

Use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.

The class TableMergeSyncSupportTest, method doTestMergePK.

private void doTestMergePK(String targetSchema, String targetTable, PartitionSpec spec) {
    List<String> results = fetchEmployees(targetSchema, targetTable);
    assertEquals(1, results.size());
    ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
    ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
    ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    // We should have 4 records
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(4, results.size());
    assertFalse("Should not have old valur", results.stream().anyMatch(s -> s.contains("OLD")));
    // Run merge with dedupe and should get the following two additional results. The result should not include any duplicates in the target table.
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'OLD'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'OLD'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, results.size());
    // Verify no duplicates exist in the table
    HashSet<String> existing = new HashSet<>();
    for (String r : results) {
        assertFalse(existing.contains(r));
        existing.add(r);
    }
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'ABC'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (102,'Buddy'," + "'ABC'," + "'94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(7, results.size());
    existing = new HashSet<>();
    for (String r : results) {
        assertFalse(existing.contains(r));
        existing.add(r);
    }
    assertFalse("Should not have old valur", results.stream().anyMatch(s -> s.contains("OLD")));
}
Also used: HiveShell(com.klarna.hiverunner.HiveShell), MapUtils(org.apache.commons.collections4.MapUtils), Arrays(java.util.Arrays), HiveRunnerConfig(com.klarna.hiverunner.config.HiveRunnerConfig), RunWith(org.junit.runner.RunWith), PartitionSpec(com.thinkbiganalytics.util.PartitionSpec), Assert.assertTrue(org.junit.Assert.assertTrue), Test(org.junit.Test), HashMap(java.util.HashMap), ColumnSpec(com.thinkbiganalytics.util.ColumnSpec), HiveUtils(com.thinkbiganalytics.hive.util.HiveUtils), HashSet(java.util.HashSet), HiveProperties(com.klarna.hiverunner.annotations.HiveProperties), HiveSQL(com.klarna.hiverunner.annotations.HiveSQL), SQLException(java.sql.SQLException), List(java.util.List), Vector(java.util.Vector), HiveRunnerSetup(com.klarna.hiverunner.annotations.HiveRunnerSetup), PartitionBatch(com.thinkbiganalytics.util.PartitionBatch), Assert.assertFalse(org.junit.Assert.assertFalse), Map(java.util.Map), StandaloneHiveRunner(com.klarna.hiverunner.StandaloneHiveRunner), Assert.assertEquals(org.junit.Assert.assertEquals), Before(org.junit.Before)
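
The duplicate check written as a loop above can also be expressed with set semantics; a compact equivalent, using the HashSet and assertEquals imports this test already has, would be:

// No duplicates iff collapsing the result list into a set loses no elements
assertEquals("Expected no duplicate rows", results.size(), new HashSet<>(results).size());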

Example 5 with PartitionSpec

Use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.

The class TableMergeSyncSupportTest, method testMergePKWithEmptyTargetTable.

/**
 * Tests the merge with empty target table
 */
@Test
public void testMergePKWithEmptyTargetTable() throws Exception {
    List<String> results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(0, results.size());
    ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
    ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
    ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
    // We should have 4 records
    results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(4, results.size());
    // Merge with same source should leave us with 4 records
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
    // We should have 4 records
    results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(4, results.size());
    // Should update 1 and add 1
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') (  `id`,  `timestamp`, `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values " + "(1,'1'," + "'NEW VALUE'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') (  `id`,  `timestamp`, `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values " + "(10010,'1'," + "'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), "20160119074350", columnSpecs);
    // We should now have 5 records (one updated, one added)
    results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(5, results.size());
}
Also used: ColumnSpec(com.thinkbiganalytics.util.ColumnSpec), PartitionSpec(com.thinkbiganalytics.util.PartitionSpec), Test(org.junit.Test)
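
Examples 1 and 5 show the two ways a PartitionSpec reaches doPKMerge: parsed from the PARTITION_SPECIFICATION property string, or constructed empty for a non-partitioned target table. A minimal sketch of both forms follows; the name|type|formula layout of the spec string is an assumption about Kylo's convention, not something shown on this page:

// Non-partitioned target, as in the test above
PartitionSpec noPartitions = new PartitionSpec();

// Partitioned target; the "name|type|formula" per-line layout is assumed for illustration
PartitionSpec byCountryAndYear = new PartitionSpec("country|string|country\nyear|int|year(hired)");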

Aggregations

PartitionSpec (com.thinkbiganalytics.util.PartitionSpec): 6
ColumnSpec (com.thinkbiganalytics.util.ColumnSpec): 4
PartitionBatch (com.thinkbiganalytics.util.PartitionBatch): 4
SQLException (java.sql.SQLException): 3
Test (org.junit.Test): 3
HiveShell (com.klarna.hiverunner.HiveShell): 2
StandaloneHiveRunner (com.klarna.hiverunner.StandaloneHiveRunner): 2
HiveProperties (com.klarna.hiverunner.annotations.HiveProperties): 2
HiveRunnerSetup (com.klarna.hiverunner.annotations.HiveRunnerSetup): 2
HiveSQL (com.klarna.hiverunner.annotations.HiveSQL): 2
HiveRunnerConfig (com.klarna.hiverunner.config.HiveRunnerConfig): 2
HiveUtils (com.thinkbiganalytics.hive.util.HiveUtils): 2
Arrays (java.util.Arrays): 2
HashMap (java.util.HashMap): 2
HashSet (java.util.HashSet): 2
List (java.util.List): 2
Map (java.util.Map): 2
Vector (java.util.Vector): 2
MapUtils (org.apache.commons.collections4.MapUtils): 2
Assert.assertEquals (org.junit.Assert.assertEquals): 2