Use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.
Class: TableMergeSyncSupport — method: generatePKMergePartitionQuery.
/**
 * Generates a query to merge two tables containing partitions on a primary key.
 *
 * @param selectFields the list of fields in the select clause of the source table
 * @param partitionSpec partition specification
 * @param sourceSchema the name of the source table schema or database
 * @param sourceTable the source table
 * @param targetSchema the name of the target table schema or database
 * @param targetTable the target table
 * @param feedPartitionValue the partition of the source table to use
 * @param columnSpecs the column specifications
 * @return the sql
 */
protected String generatePKMergePartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec partitionSpec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final ColumnSpec[] columnSpecs) {
    // Include alias
    String selectSQL = StringUtils.join(selectFields, ",");
    String[] selectFieldsWithAlias = selectFieldsForAlias(selectFields, "a");
    String selectSQLWithAlias = StringUtils.join(selectFieldsWithAlias, ",");
    String joinOnClause = ColumnSpec.toPrimaryKeyJoinSQL(columnSpecs, "a", "b");
    String[] primaryKeys = ColumnSpec.toPrimaryKeys(columnSpecs);
    PartitionSpec partitionSpecWithAlias = partitionSpec.newForAlias("a");
    // Any one primary-key column suffices for the IS NULL anti-join test below.
    String anyPK = primaryKeys[0];
    List<PartitionBatch> batches = createPartitionBatchesforPKMerge(partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue, joinOnClause);
    String targetPartitionWhereClause = targetPartitionsWhereClause(PartitionBatch.toPartitionBatchesForAlias(batches, "a"), false);
    // TODO: If the records matching the primary key between the source and target are in a different partition
    // AND the matching records are the only remaining records of the partition, then the following sql will fail to overwrite the
    // remaining record. We need to detect this and then delete partition? This is a complex scenario..
    String sbSourceQuery = "select " + selectSQL + "," + partitionSpec.toDynamicSelectSQLSpec() + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
    // The union has two branches:
    //   1. all new records from the source feed partition;
    //   2. all target records in the impacted partitions whose primary key does NOT
    //      match any incoming record (anti-join via "b.<pk> is null"), so they are preserved.
    StringBuilder sb = new StringBuilder();
    sb.append("insert overwrite table ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ")
        .append(partitionSpec.toDynamicPartitionSpec())
        .append("select ").append(selectSQL).append(",").append(partitionSpec.toPartitionSelectSQL())
        .append(" from (")
        // Branch 1: new records arriving in the source feed partition.
        .append(" select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec())
        .append(" from ").append(HiveUtils.quoteIdentifier(sourceSchema, sourceTable)).append(" a")
        .append(" where ").append(" a.processing_dttm = ").append(HiveUtils.quoteString(feedPartitionValue))
        .append(" union all ")
        // Branch 2: existing target records not overwritten by an incoming primary key.
        .append(" select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec())
        .append(" from ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" a left outer join (")
        .append(sbSourceQuery).append(") b ")
        .append(" on (").append(joinOnClause).append(")")
        .append(" where ").append(" (b.").append(anyPK).append(" is null)");
    if (targetPartitionWhereClause != null) {
        // Restrict branch 2 to only the partitions actually impacted by this feed.
        sb.append(" and (").append(targetPartitionWhereClause).append(")");
    }
    sb.append(") t");
    return sb.toString();
}
Use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.
Class: TableMergeSyncSupport — method: createPartitionBatchesforPKMerge.
/**
 * Finds all partitions of the target table that contain records whose primary keys
 * match records in the given feed partition of the source table.
 *
 * @param spec the partition spec
 * @param sourceSchema the name of the source table schema or database
 * @param sourceTable the source table
 * @param targetSchema the name of the target table schema or database
 * @param targetTable the target table
 * @param feedPartitionValue the partition of the source table to use
 * @param joinOnClause the JOIN clause for the source and target tables
 * @return the matching partitions, each with its matching-record count
 */
protected List<PartitionBatch> createPartitionBatchesforPKMerge(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final String joinOnClause) {
    List<PartitionBatch> v;
    PartitionSpec aliasSpecA = spec.newForAlias("a");
    // Find all partitions that contain matching keys.
    // Note: quote the feed partition value via HiveUtils.quoteString() — consistent with the
    // other query builders in this class — instead of naive '...' concatenation, so values
    // containing quotes cannot break or alter the generated SQL.
    String sql = "select " + aliasSpecA.toPartitionSelectSQL() + ", count(0)"
                 + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " a join "
                 + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " b"
                 + " on " + joinOnClause
                 + " where b.processing_dttm = " + HiveUtils.quoteString(feedPartitionValue)
                 + " group by " + aliasSpecA.toPartitionSelectSQL();
    try (final Statement st = conn.createStatement()) {
        logger.info("Selecting target partitions query [" + sql + "]");
        // Explicitly close the ResultSet once the batches have been materialized.
        try (final ResultSet rs = doSelectSQL(st, sql)) {
            v = toPartitionBatches(spec, rs);
        }
    } catch (SQLException e) {
        logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
        throw new RuntimeException("Failed to select partition batches", e);
    }
    return v;
}
Use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.
Class: TableMergeSyncSupport — method: createPartitionBatches.
/**
 * Generates batches of distinct partitions present in the source table's feed partition.
 *
 * @param spec the partition specification
 * @param sourceSchema the schema or database name of the source table
 * @param sourceTable the source table name
 * @param feedPartition the source processing partition value
 * @return one {@link PartitionBatch} per distinct partition found
 */
protected List<PartitionBatch> createPartitionBatches(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String feedPartition) {
    List<PartitionBatch> v;
    String sql = "";
    try (final Statement st = conn.createStatement()) {
        sql = spec.toDistinctSelectSQL(sourceSchema, sourceTable, feedPartition);
        logger.info("Executing batch query [" + sql + "]");
        // Explicitly close the ResultSet once the batches have been materialized.
        try (final ResultSet rs = doSelectSQL(st, sql)) {
            v = toPartitionBatches(spec, rs);
        }
    } catch (SQLException e) {
        logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
        throw new RuntimeException("Failed to select partition batches", e);
    }
    return v;
}
Use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.
Class: TableMergeSyncSupportTest — method: fetchPartitionBatches.
/**
 * Executes the distinct-partitions query against the embedded Hive shell and converts
 * each result row into a {@link PartitionBatch}.
 *
 * @return one batch per distinct partition found in the source processing partition
 */
private List<PartitionBatch> fetchPartitionBatches() {
    final String sql = spec.toDistinctSelectSQL(sourceSchema, sourceTable, processingPartition);
    final List<PartitionBatch> batches = new Vector<>();
    for (final Object[] row : hiveShell.executeStatement(sql)) {
        // Columns 0 and 1 hold the partition values; column 2 holds the record count.
        final String[] partitionValues = new String[] { row[0].toString(), row[1].toString() };
        batches.add(new PartitionBatch((Long) row[2], spec, partitionValues));
    }
    return batches;
}
Use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.
Class: TableMergeSyncSupport — method: targetPartitionsWhereClause.
/**
 * Produces a WHERE clause limiting a query to the impacted partitions of the target table.
 *
 * @param batches a list of partition batches
 * @param useSourceColumns whether the predicates should reference source columns rather than target columns
 * @return an OR-joined where clause string, or {@code null} when there are no batches
 */
private String targetPartitionsWhereClause(List<PartitionBatch> batches, boolean useSourceColumns) {
    final List<String> predicates = new Vector<>();
    for (final PartitionBatch batch : batches) {
        final PartitionSpec batchSpec = batch.getPartitionSpec();
        final String predicate = useSourceColumns
                                 ? batchSpec.toSourceSQLWhere(batch.getPartitionValues())
                                 : batchSpec.toTargetSQLWhere(batch.getPartitionValues());
        predicates.add("(" + predicate + ")");
    }
    if (predicates.isEmpty()) {
        return null;
    }
    return StringUtils.join(predicates.toArray(new String[0]), " or ");
}
Aggregations