Search in sources :

Example 1 with PartitionBatch

use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.

the class TableMergeSyncSupport method generatePKMergePartitionQuery.

/**
 * Generates a query two merge two tables containing partitions on a primary key.
 *
 * @param selectFields       the list of fields in the select clause of the source table
 * @param partitionSpec      partition specification
 * @param sourceSchema       the name of the source table schema or database
 * @param sourceTable        the source table
 * @param targetSchema       the name of the target table schema or database
 * @param targetTable        the target table
 * @param feedPartitionValue the partition of the source table to use
 * @param columnSpecs        the column specifications
 * @return the sql
 */
protected String generatePKMergePartitionQuery(@Nonnull final String[] selectFields, @Nonnull final PartitionSpec partitionSpec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final ColumnSpec[] columnSpecs) {
    // Include alias
    String selectSQL = StringUtils.join(selectFields, ",");
    String[] selectFieldsWithAlias = selectFieldsForAlias(selectFields, "a");
    String selectSQLWithAlias = StringUtils.join(selectFieldsWithAlias, ",");
    String joinOnClause = ColumnSpec.toPrimaryKeyJoinSQL(columnSpecs, "a", "b");
    String[] primaryKeys = ColumnSpec.toPrimaryKeys(columnSpecs);
    PartitionSpec partitionSpecWithAlias = partitionSpec.newForAlias("a");
    String anyPK = primaryKeys[0];
    List<PartitionBatch> batches = createPartitionBatchesforPKMerge(partitionSpec, sourceSchema, sourceTable, targetSchema, targetTable, feedPartitionValue, joinOnClause);
    String targetPartitionWhereClause = targetPartitionsWhereClause(PartitionBatch.toPartitionBatchesForAlias(batches, "a"), false);
    // TODO: If the records matching the primary key between the source and target are in a different partition
    // AND the matching records are the only remaining records of the partition, then the following sql will fail to overwrite the
    // remaining record.  We need to detect this and then delete partition? This is a complex scenario..
    String sbSourceQuery = "select " + selectSQL + "," + partitionSpec.toDynamicSelectSQLSpec() + " from " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " where processing_dttm = " + HiveUtils.quoteString(feedPartitionValue);
    // First finds all records in valid
    // Second finds all records in target that should be preserved for impacted partitions
    StringBuilder sb = new StringBuilder();
    sb.append("insert overwrite table ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" ").append(partitionSpec.toDynamicPartitionSpec()).append("select ").append(selectSQL).append(",").append(partitionSpec.toPartitionSelectSQL()).append(" from (").append("  select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec()).append("  from ").append(HiveUtils.quoteIdentifier(sourceSchema, sourceTable)).append(" a").append("  where ").append("  a.processing_dttm = ").append(HiveUtils.quoteString(feedPartitionValue)).append(" union all ").append("  select ").append(selectSQLWithAlias).append(",").append(partitionSpecWithAlias.toDynamicSelectSQLSpec()).append("  from ").append(HiveUtils.quoteIdentifier(targetSchema, targetTable)).append(" a left outer join (").append(sbSourceQuery).append(") b ").append("  on (").append(joinOnClause).append(")").append("  where ").append("  (b.").append(anyPK).append(" is null)");
    if (targetPartitionWhereClause != null) {
        sb.append(" and (").append(targetPartitionWhereClause).append(")");
    }
    sb.append(") t");
    return sb.toString();
}
Also used : PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) PartitionSpec(com.thinkbiganalytics.util.PartitionSpec)

Example 2 with PartitionBatch

use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.

the class TableMergeSyncSupport method createPartitionBatchesforPKMerge.

/**
 * Finds all partitions that contain matching keys.
 *
 * @param spec               the partition spec
 * @param sourceSchema       the name of the source table schema or database
 * @param sourceTable        the source table
 * @param targetSchema       the name of the target table schema or database
 * @param targetTable        the target table
 * @param feedPartitionValue the partition of the source table to use
 * @param joinOnClause       the JOIN clause for the source and target tables
 * @return the matching partitions
 */
protected List<PartitionBatch> createPartitionBatchesforPKMerge(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String targetSchema, @Nonnull final String targetTable, @Nonnull final String feedPartitionValue, @Nonnull final String joinOnClause) {
    List<PartitionBatch> v;
    PartitionSpec aliasSpecA = spec.newForAlias("a");
    // Find all partitions that contain matching keys
    String sql = "select " + aliasSpecA.toPartitionSelectSQL() + ", count(0)" + " from " + HiveUtils.quoteIdentifier(targetSchema, targetTable) + " a join " + HiveUtils.quoteIdentifier(sourceSchema, sourceTable) + " b" + " on " + joinOnClause + " where b.processing_dttm = '" + feedPartitionValue + "'" + " group by " + aliasSpecA.toPartitionSelectSQL();
    try (final Statement st = conn.createStatement()) {
        logger.info("Selecting target partitions query [" + sql + "]");
        ResultSet rs = doSelectSQL(st, sql);
        v = toPartitionBatches(spec, rs);
    } catch (SQLException e) {
        logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
        throw new RuntimeException("Failed to select partition batches", e);
    }
    return v;
}
Also used : SQLException(java.sql.SQLException) PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) Statement(java.sql.Statement) ResultSet(java.sql.ResultSet) PartitionSpec(com.thinkbiganalytics.util.PartitionSpec)

Example 3 with PartitionBatch

use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.

the class TableMergeSyncSupport method createPartitionBatches.

/**
 * Generates batches of partitions in the source table.
 *
 * @param spec          the partition specification
 * @param sourceSchema  the schema or database name of the source table
 * @param sourceTable   the source table name
 * @param feedPartition the source processing partition value
 */
protected List<PartitionBatch> createPartitionBatches(@Nonnull final PartitionSpec spec, @Nonnull final String sourceSchema, @Nonnull final String sourceTable, @Nonnull final String feedPartition) {
    List<PartitionBatch> v;
    String sql = "";
    try (final Statement st = conn.createStatement()) {
        sql = spec.toDistinctSelectSQL(sourceSchema, sourceTable, feedPartition);
        logger.info("Executing batch query [" + sql + "]");
        ResultSet rs = doSelectSQL(st, sql);
        v = toPartitionBatches(spec, rs);
    } catch (SQLException e) {
        logger.error("Failed to select partition batches SQL {} with error {}", sql, e);
        throw new RuntimeException("Failed to select partition batches", e);
    }
    return v;
}
Also used : SQLException(java.sql.SQLException) PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) Statement(java.sql.Statement) ResultSet(java.sql.ResultSet)

Example 4 with PartitionBatch

use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.

the class TableMergeSyncSupportTest method fetchPartitionBatches.

private List<PartitionBatch> fetchPartitionBatches() {
    List<PartitionBatch> vBatch = new Vector<>();
    String sql = spec.toDistinctSelectSQL(sourceSchema, sourceTable, processingPartition);
    List<Object[]> results = hiveShell.executeStatement(sql);
    for (Object[] vals : results) {
        vBatch.add(new PartitionBatch((Long) vals[2], spec, new String[] { vals[0].toString(), vals[1].toString() }));
    }
    return vBatch;
}
Also used : PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) Vector(java.util.Vector)

Example 5 with PartitionBatch

use of com.thinkbiganalytics.util.PartitionBatch in project kylo by Teradata.

the class TableMergeSyncSupport method targetPartitionsWhereClause.

/**
 * Produces a where clause that limits to the impacted partitions of the target table
 *
 * @param batches          a list of partition batches
 * @param useSourceColumns a boolean value to decide whether to use the source columns or target columns
 * @return a where clause sql string
 */
private String targetPartitionsWhereClause(List<PartitionBatch> batches, boolean useSourceColumns) {
    List<String> targetPartitionsItems = new Vector<>();
    for (PartitionBatch batch : batches) {
        String column = useSourceColumns ? batch.getPartitionSpec().toSourceSQLWhere(batch.getPartitionValues()) : batch.getPartitionSpec().toTargetSQLWhere(batch.getPartitionValues());
        targetPartitionsItems.add("(" + column + ")");
    }
    return (targetPartitionsItems.size() == 0 ? null : StringUtils.join(targetPartitionsItems.toArray(new String[0]), " or "));
}
Also used : PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) Vector(java.util.Vector)

Aggregations

PartitionBatch (com.thinkbiganalytics.util.PartitionBatch)6 Vector (java.util.Vector)3 PartitionSpec (com.thinkbiganalytics.util.PartitionSpec)2 ResultSet (java.sql.ResultSet)2 SQLException (java.sql.SQLException)2 Statement (java.sql.Statement)2