Use of com.thinkbiganalytics.util.PartitionSpec in the kylo project by Teradata.
From the class TableMergeSyncSupportTest, method doTestMergePKWithDifferentPartitions.
/**
 * Tests the ability to strip records that match the ID but sit in a different partition than
 * the newer record.
 *
 * <p>The scenario runs in three phases:
 * <ol>
 *   <li>Six rows are inserted into {@code emp_sr.employee} across the 2012, 2013 and 2015
 *       partitions (three of them carry the company value "OLD VALUE"). After a PK merge the
 *       target must still hold six rows and none may contain "OLD".</li>
 *   <li>Ids 100 and 101 are staged in {@code emp_sr.employee_valid} under
 *       {@code processing_dttm='20160119074340'}; after merging, eight distinct rows are
 *       expected.</li>
 *   <li>Ids 100, 101 and 102 are staged under {@code processing_dttm='20160119074540'} with
 *       company "ABC"; after merging that partition, nine distinct rows are expected and no
 *       row may contain "OLD".</li>
 * </ol>
 *
 * <p>NOTE(review): the first merge relies on source rows that are prepared outside this
 * method; presumably they hold newer versions of ids 1, 2 and 3 - confirm against the fixture.
 *
 * @param targetSchema schema passed to {@code fetchEmployees} and {@code doPKMerge}
 * @param targetTable  table passed to {@code fetchEmployees} and {@code doPKMerge}
 * @param spec         partition specification handed to {@code doPKMerge}
 */
private void doTestMergePKWithDifferentPartitions(String targetSchema, String targetTable, PartitionSpec spec) {
    // Column list shared by every insert into the target table.
    final String targetColumns = " ( `id`, `timestamp`,`name`,`company`,`zip`,`phone`,`email`, `hired`) values ";

    // Seed the target table; the statements are executed one at a time, in this order.
    final String[] seedStatements = {
        "insert into emp_sr.employee partition(country='USA',year=2012)" + targetColumns
            + "(1,'1','Sally','OLD VALUE','94550','555-1212','sally@acme.org','2012-01-01');",
        "insert into emp_sr.employee partition(country='USA',year=2012)" + targetColumns
            + "(1002,'1','Jimbo','VALUE','94550','555-1212','sally@acme.org','2012-01-01');",
        "insert into emp_sr.employee partition(country='USA',year=2015)" + targetColumns
            + "(1000,'1','Jill','ORIG','94550','555-1212','sally@acme.org','2015-01-01');",
        "insert into emp_sr.employee partition(country='USA',year=2013)" + targetColumns
            + "(2,'1','Bill','OLD VALUE','94550','555-1212','sally@acme.org','2013-01-01');",
        "insert into emp_sr.employee partition(country='USA',year=2013)" + targetColumns
            + "(3,'1','Ray','OLD VALUE','94550','555-1212','sally@acme.org','2013-01-01');",
        "insert into emp_sr.employee partition(country='USA',year=2013)" + targetColumns
            + "(1001,'1','Fred','VALUE','94550','555-1212','sally@acme.org','2013-01-01');"
    };
    for (String statement : seedStatements) {
        hiveShell.execute(statement);
    }

    List<String> rows = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, rows.size());

    // "id" is built with its first boolean flag set to true, "name" with all flags false.
    final ColumnSpec idColumn = new ColumnSpec("id", "String", "", true, false, false);
    final ColumnSpec nameColumn = new ColumnSpec("name", "String", "", false, false, false);
    final ColumnSpec[] columnSpecs = {idColumn, nameColumn};

    // Phase 1: merge; the row count must stay at six and every "OLD" value must be gone.
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    rows = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, rows.size());
    assertFalse("Should not have old value", rows.stream().anyMatch(row -> row.contains("OLD")));

    // Column list shared by every insert into the staging (valid) table.
    final String validColumns = " ( `id`, `name`,`company`,`zip`,`phone`,`email`, `hired`,`country`) values ";

    // Phase 2: stage two new ids, merge again, and expect two additional, non-duplicated rows.
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340')" + validColumns
                      + "(100,'Bruce','OLD','94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340')" + validColumns
                      + "(101,'Harry','OLD','94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    rows = fetchEmployees(targetSchema, targetTable);
    assertEquals(8, rows.size());
    // A set collapses equal rows, so a smaller set means the table contains duplicates.
    assertFalse(new HashSet<>(rows).size() < rows.size());

    // Phase 3: stage a later partition that repeats ids 100 and 101 and adds id 102.
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540')" + validColumns
                      + "(100,'Bruce','ABC','94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540')" + validColumns
                      + "(101,'Harry','ABC','94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540')" + validColumns
                      + "(102,'Buddy','ABC','94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
    rows = fetchEmployees(targetSchema, targetTable);
    assertEquals(9, rows.size());
    assertFalse(new HashSet<>(rows).size() < rows.size());
    assertFalse("Should not have old value", rows.stream().anyMatch(row -> row.contains("OLD")));
}
Aggregations