use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.
the class AbstractMergeTable method onTrigger.
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final ComponentLog logger = getLog();
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
final String blockingValue = context.getProperty(BLOCKING_KEY).evaluateAttributeExpressions(flowFile).getValue();
String flowFileId = flowFile.getAttribute(CoreAttributes.UUID.key());
boolean block = false;
if (blocking && blockingCache.putIfAbsent(blockingValue, flowFileId) != null) {
if (StringUtils.isBlank(flowFile.getAttribute(BLOCKED_START_TIME))) {
flowFile = session.putAttribute(flowFile, BLOCKED_START_TIME, String.valueOf(System.currentTimeMillis()));
getLogger().info("Transferring Flow file {} to blocked relationship", new Object[] { flowFile });
}
// penalize the flow file and transfer to BLOCKED
flowFile = session.penalize(flowFile);
session.transfer(flowFile, REL_BLOCKED);
return;
}
// Add Blocking time to flow file if this was a blocked flowfile.
if (blocking && StringUtils.isNotBlank(flowFile.getAttribute(BLOCKED_START_TIME))) {
String blockedStartTime = flowFile.getAttribute(BLOCKED_START_TIME);
try {
Long l = Long.parseLong(blockedStartTime);
Long blockTime = System.currentTimeMillis() - l;
getLogger().info("Processing Blocked flow file {}. This was blocked for {} ms", new Object[] { flowFile, blockTime });
flowFile = session.putAttribute(flowFile, BLOCKED_TIME, String.valueOf(blockTime) + " ms");
} catch (NumberFormatException e) {
// ignore a malformed BLOCKED_START_TIME value; simply skip recording the blocked time
}
}
String PROVENANCE_EXECUTION_STATUS_KEY = context.getName() + " Execution Status";
String partitionSpecString = context.getProperty(PARTITION_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue();
String sourceSchema = context.getProperty(SOURCE_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
String sourceTable = context.getProperty(SOURCE_TABLE).evaluateAttributeExpressions(flowFile).getValue();
String targetSchema = context.getProperty(TARGET_SCHEMA).evaluateAttributeExpressions(flowFile).getValue();
String targetTable = context.getProperty(TARGET_TABLE).evaluateAttributeExpressions(flowFile).getValue();
String feedPartitionValue = context.getProperty(FEED_PARTITION).evaluateAttributeExpressions(flowFile).getValue();
String mergeStrategyValue = context.getProperty(MERGE_STRATEGY).evaluateAttributeExpressions(flowFile).getValue();
String hiveConfigurations = context.getProperty(HIVE_CONFIGURATIONS).evaluateAttributeExpressions(flowFile).getValue();
boolean resetHive = context.getProperty(RESET_HIVE).asBoolean();
final ColumnSpec[] columnSpecs = Optional.ofNullable(context.getProperty(FIELD_SPECIFICATION).evaluateAttributeExpressions(flowFile).getValue()).filter(StringUtils::isNotEmpty).map(ColumnSpec::createFromString).orElse(new ColumnSpec[0]);
if (STRATEGY_PK_MERGE.equals(mergeStrategyValue) && (columnSpecs == null || columnSpecs.length == 0)) {
getLog().error("Missing required field specification for PK merge feature");
flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Failed: Missing required field specification for PK merge feature");
release(blockingValue);
session.transfer(flowFile, IngestProperties.REL_FAILURE);
return;
}
// Maintain default for backward compatibility
if (StringUtils.isEmpty(mergeStrategyValue)) {
mergeStrategyValue = STRATEGY_DEDUPE_MERGE;
}
logger.info("Merge strategy: " + mergeStrategyValue + " Using Source: " + sourceTable + " Target: " + targetTable + " feed partition:" + feedPartitionValue + " partSpec: " + partitionSpecString);
final StopWatch stopWatch = new StopWatch(true);
try (final Connection conn = getConnection(context)) {
TableMergeSyncSupport mergeSupport = new TableMergeSyncSupport(conn);
if (resetHive) {
mergeSupport.resetHiveConf();
}
mergeSupport.enableDynamicPartitions();
if (StringUtils.isNotEmpty(hiveConfigurations)) {
mergeSupport.setHiveConf(hiveConfigurations.split("\\|"));
}
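// setHiveConf (implementation not shown in this listing) receives the settings
// already split on "|"; presumably each "key=value" entry is applied with a
// Hive "set" statement over the same JDBC connection, roughly:
//   try (Statement st = conn.createStatement()) {
//       for (String setting : settings) {
//           st.execute("set " + setting.trim());
//       }
//   }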
PartitionSpec partitionSpec = new PartitionSpec(partitionSpecString);
if (STRATEGY_DEDUPE_MERGE.equals(mergeStrategyValue)) {
mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, true);
} else if (STRATEGY_MERGE.equals(mergeStrategyValue)) {
mergeSupport.doMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, false);
} else if (STRATEGY_SYNC.equals(mergeStrategyValue)) {
mergeSupport.doSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue);
} else if (STRATEGY_ROLLING_SYNC.equals(mergeStrategyValue)) {
mergeSupport.doRollingSync(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue);
} else if (STRATEGY_PK_MERGE.equals(mergeStrategyValue)) {
mergeSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, partitionSpec, feedPartitionValue, columnSpecs);
} else {
throw new UnsupportedOperationException("Failed to resolve the merge strategy");
}
session.getProvenanceReporter().modifyContent(flowFile, "Execution completed", stopWatch.getElapsed(TimeUnit.MILLISECONDS));
flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Successful");
release(blockingValue);
logger.info("Execution completed: " + stopWatch.getElapsed(TimeUnit.MILLISECONDS) + " Merge strategy: " + mergeStrategyValue + " Using Source: " + sourceTable + " Target: " + targetTable + " feed partition:" + feedPartitionValue + " partSpec: " + partitionSpecString);
session.transfer(flowFile, REL_SUCCESS);
} catch (final Exception e) {
logger.error("Unable to execute merge doMerge for {} due to {}; routing to failure", new Object[] { flowFile, e }, e);
flowFile = session.putAttribute(flowFile, PROVENANCE_EXECUTION_STATUS_KEY, "Failed: " + e.getMessage());
release(blockingValue);
session.transfer(flowFile, REL_FAILURE);
}
}
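The pattern to note in onTrigger is the atomic putIfAbsent: only the first flow file to claim a blocking key proceeds, every later one is penalized and routed to REL_BLOCKED, and release(blockingValue) frees the key on each terminal path (validation failure, success, and exception). A minimal standalone sketch of that gate, assuming blockingCache is a ConcurrentMap<String, String> (the class and method names below are illustrative, not Kylo's):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Sketch of a putIfAbsent-based gate like the one AbstractMergeTable uses.
public class BlockingGate {

    private final ConcurrentMap<String, String> blockingCache = new ConcurrentHashMap<>();

    // Returns true if the caller now owns the key; false means another flow file holds it.
    public boolean tryAcquire(String blockingValue, String flowFileId) {
        return blockingCache.putIfAbsent(blockingValue, flowFileId) == null;
    }

    // Frees the key so a blocked flow file can proceed on a later trigger.
    public void release(String blockingValue) {
        blockingCache.remove(blockingValue);
    }
}

Because every exit path after acquisition must call release, wrapping the merge body in try/finally would be a defensive alternative to the three explicit release calls above.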
use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.
the class TableMergeSyncSupport method createPartitionBatchesforPKMerge.
/**
* Finds all partitions that contain matching keys.
*
* @param spec the partition spec
* @param sourceSchema the name of the source table schema or database
* @param sourceTable the source table
* @param targetSchema the name of the target table schema or database
* @param targetTable the target table
* @param feedPartitionValue the partition of the source table to use
* @param joinOnClause the JOIN clause for the source and target tables
* @return the matching partitions
*/
protected List<PartitionBatch> createPartitionBatchesforPKMerge(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final String joinOnClause) {
List<PartitionBatch> v;
PartitionSpec aliasSpecA = spec.newForAlias("a");
// Find all partitions that contain matching keys
String sql = "select " + aliasSpecA.toPartitionSelectSQL() + ", count(0)" + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " a join " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " b" + " on " + joinOnClause + " where b.processing_dttm = '" + feedPartitionValue + "'" + " group by " + aliasSpecA.toPartitionSelectSQL();
try (final Statement st = conn.createStatement()) {
logger.info("Selecting target partitions query [" + sql + "]");
ResultSet rs = doSelectSQL(st, sql);
v = toPartitionBatches(spec, rs);
} catch (SQLException e) {
logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
throw new RuntimeException("Failed to select partition batches", e);
}
return v;
}
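As a concrete illustration, with a partition spec of country|string|country and a primary-key join on id, the statement built above would take roughly this shape (schema, table, and partition values borrowed from the tests below; exact identifier quoting comes from HiveUtils and PartitionSpec):

select a.`country`, count(0)
from `emp_sr`.`employee` a join `emp_sr`.`employee_valid` b
on a.`id` = b.`id`
where b.processing_dttm = '20160119074340'
group by a.`country`

Each result row then becomes a PartitionBatch via toPartitionBatches: one set of target partition values plus the count of target records whose keys collide with the incoming feed partition.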
use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.
the class TableMergeSyncSupport method generatePKMergePartitionQuery.
/**
* Generates a query to merge two partitioned tables on a primary key.
*
* @param selectFields the list of fields in the select clause of the source table
* @param partitionSpec partition specification
* @param sourceSchema the name of the source table schema or database
* @param sourceTable the source table
* @param targetSchema the name of the target table schema or database
* @param targetTable the target table
* @param feedPartitionValue the partition of the source table to use
* @param columnSpecs the column specifications
* @return the sql
*/
protected String generatePKMergePartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec partitionSpec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final ColumnSpec[] columnSpecs) {
// Include alias
String selectSQL = StringUtils.join(selectFields, ",");
String[] selectFieldsWithAlias = selectFieldsForAlias(selectFields, "a");
String selectSQLWithAlias = StringUtils.join(selectFieldsWithAlias, ",");
String joinOnClause = ColumnSpec.toPrimaryKeyJoinSQL(columnSpecs, "a", "b");
String[] primaryKeys = ColumnSpec.toPrimaryKeys(columnSpecs);
PartitionSpec partitionSpecWithAlias = partitionSpec.newForAlias("a");
String anyPK = primaryKeys[0];
List<PartitionBatch> batches = createPartitionBatchesforPKMerge(partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue, joinOnClause);
String targetPartitionWhereClause = targetPartitionsWhereClause(PartitionBatch.toPartitionBatchesForAlias(batches, "a"), false);
// TODO: If the records matching the primary key between the source and target are in a different partition
// AND the matching records are the only remaining records of the partition, then the following sql will fail to overwrite the
// remaining record. We need to detect this case and then delete the partition? This is a complex scenario.
String sbSourceQuery = "select " + selectSQL + "," + partitionSpec.toDynamicSelectSQLSpec() + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
// The first union branch selects all records in the source (valid) table for the feed partition;
// the second selects the target records that should be preserved for the impacted partitions
StringBuilder sb = new StringBuilder();
sb.append("insert overwrite table ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ").append(partitionSpec.toDynamicPartitionSpec()).append("select ").append(selectSQL).append(",").append(partitionSpec.toPartitionSelectSQL()).append(" from (").append(" select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec()).append(" from ").append(HiveUtils.quoteIdentifier(sourceSchema, sourceTable)).append(" a").append(" where ").append(" a.processing_dttm = ").append(HiveUtils.quoteString(feedPartitionValue)).append(" union all ").append(" select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec()).append(" from ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" a left outer join (").append(sbSourceQuery).append(") b ").append(" on (").append(joinOnClause).append(")").append(" where ").append(" (b.").append(anyPK).append(" is null)");
if (targetPartitionWhereClause != null) {
sb.append(" and (").append(targetPartitionWhereClause).append(")");
}
sb.append(") t");
return sb.toString();
}
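Putting it together, for select fields id and name, a single partition column country, and primary key id, the generated statement looks roughly like this (an approximation; the real column lists come from toDynamicSelectSQLSpec, toPartitionSelectSQL, and the target partition where clause):

insert overwrite table `emp_sr`.`employee` partition (country)
select id, name, country from (
  select a.id, a.name, a.country
  from `emp_sr`.`employee_valid` a
  where a.processing_dttm = '20160119074340'
  union all
  select a.id, a.name, a.country
  from `emp_sr`.`employee` a left outer join (
    select id, name, country from `emp_sr`.`employee_valid`
    where processing_dttm = '20160119074340'
  ) b on (a.id = b.id)
  where (b.id is null) and (a.country = 'Canada')
) t

The first branch of the union takes every record in the feed partition of the source; the second keeps the target records in the impacted partitions whose keys were not matched, so the dynamic-partition insert overwrite rewrites those partitions without losing unmatched rows. The TODO above describes the gap in this approach: if every remaining record of some target partition matches a source key, nothing selects into that partition, the overwrite never touches it, and the stale rows survive.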
use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.
the class TableMergeSyncSupportTest method doTestMergePK.
private void doTestMergePK(String targetSchema, String targetTable, PartitionSpec spec) {
List<String> results = fetchEmployees(targetSchema, targetTable);
assertEquals(1, results.size());
ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTable);
assertEquals(4, results.size());
assertFalse("Should not have old valur", results.stream().anyMatch(s -> s.contains("OLD")));
// Insert two additional source records, then run the PK merge again; we should get two additional results and no duplicates in the target table.
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (100,'Bruce'," + "'OLD'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Harry'," + "'OLD'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(6, results.size());
// Verify no duplicates exist in the table
HashSet<String> existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (100,'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (101,'Harry'," + "'ABC'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values (102,'Buddy'," + "'ABC'," + "'94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
results = fetchEmployees(targetSchema, targetTable);
assertEquals(7, results.size());
existing = new HashSet<>();
for (String r : results) {
assertFalse(existing.contains(r));
existing.add(r);
}
assertFalse("Should not have old valur", results.stream().anyMatch(s -> s.contains("OLD")));
}
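A side note on the duplicate scan used twice above: Set.add returns false when the element is already present, so the same check collapses to one assertion per row (a sketch using JUnit 4's assertTrue):

// Equivalent duplicate check: add() returns false for a repeated row,
// so the assertion fails on the first duplicate encountered.
Set<String> existing = new HashSet<>();
for (String r : results) {
    assertTrue("Duplicate row: " + r, existing.add(r));
}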
use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.
the class TableMergeSyncSupportTest method testMergePKWithEmptyTargetTable.
/**
 * Tests the merge with an empty target table.
 */
@Test
public void testMergePKWithEmptyTargetTable() throws Exception {
List<String> results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(0, results.size());
ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(4, results.size());
// Merge with same source should leave us with 4 records
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
// We should have 4 records
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(4, results.size());
// Should update 1 and add 1
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') ( `id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values " + "(1,'1'," + "'NEW VALUE'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') ( `id`, `timestamp`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values " + "(10010,'1'," + "'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
// Call merge
mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), "20160119074350", columnSpecs);
// We should now have 5 records: one updated and one added
results = fetchEmployees(targetSchema, targetTableNP);
assertEquals(5, results.size());
}