Search in sources :

Example 6 with PartitionSpec

Use of com.thinkbiganalytics.util.PartitionSpec in project kylo by Teradata.

The method doTestMergePKWithDifferentPartitions of the class TableMergeSyncSupportTest.

/*
    Test ability to strip records that match the ID but are in a different partition than the newer record
     */
private void doTestMergePKWithDifferentPartitions(String targetSchema, String targetTable, PartitionSpec spec) {
    // Insert one record to start
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2012) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1,'1','Sally','OLD VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2012-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2012) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1002,'1','Jimbo','VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2012-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2015) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1000,'1','Jill','ORIG','94550'," + "'555-1212'," + "'sally@acme.org','2015-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2013) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (2,'1','Bill','OLD VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2013-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2013) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (3,'1','Ray','OLD VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2013-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2013) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1001,'1','Fred','VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2013-01-01');");
    List<String> results = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, results.size());
    ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
    ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
    ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    // We should have 6 records
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, results.size());
    assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
    // Run merge with dedupe and should get the following two additional results. The result should not include any duplicates in the target table.
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'OLD'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'OLD'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(8, results.size());
    // Verify no duplicates exist in the table
    HashSet<String> existing = new HashSet<>();
    for (String r : results) {
        assertFalse(existing.contains(r));
        existing.add(r);
    }
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'ABC'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (102,'Buddy'," + "'ABC'," + "'94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(9, results.size());
    existing = new HashSet<>();
    for (String r : results) {
        assertFalse(existing.contains(r));
        existing.add(r);
    }
    assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
}
Also used : HiveShell(com.klarna.hiverunner.HiveShell) MapUtils(org.apache.commons.collections4.MapUtils) Arrays(java.util.Arrays) HiveRunnerConfig(com.klarna.hiverunner.config.HiveRunnerConfig) RunWith(org.junit.runner.RunWith) PartitionSpec(com.thinkbiganalytics.util.PartitionSpec) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) HashMap(java.util.HashMap) ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) HiveUtils(com.thinkbiganalytics.hive.util.HiveUtils) HashSet(java.util.HashSet) HiveProperties(com.klarna.hiverunner.annotations.HiveProperties) HiveSQL(com.klarna.hiverunner.annotations.HiveSQL) SQLException(java.sql.SQLException) List(java.util.List) Vector(java.util.Vector) HiveRunnerSetup(com.klarna.hiverunner.annotations.HiveRunnerSetup) PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) Assert.assertFalse(org.junit.Assert.assertFalse) Map(java.util.Map) StandaloneHiveRunner(com.klarna.hiverunner.StandaloneHiveRunner) Assert.assertEquals(org.junit.Assert.assertEquals) Before(org.junit.Before) ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) HashSet(java.util.HashSet)

Aggregations

PartitionSpec (com.thinkbiganalytics.util.PartitionSpec)6 ColumnSpec (com.thinkbiganalytics.util.ColumnSpec)4 PartitionBatch (com.thinkbiganalytics.util.PartitionBatch)4 SQLException (java.sql.SQLException)3 Test (org.junit.Test)3 HiveShell (com.klarna.hiverunner.HiveShell)2 StandaloneHiveRunner (com.klarna.hiverunner.StandaloneHiveRunner)2 HiveProperties (com.klarna.hiverunner.annotations.HiveProperties)2 HiveRunnerSetup (com.klarna.hiverunner.annotations.HiveRunnerSetup)2 HiveSQL (com.klarna.hiverunner.annotations.HiveSQL)2 HiveRunnerConfig (com.klarna.hiverunner.config.HiveRunnerConfig)2 HiveUtils (com.thinkbiganalytics.hive.util.HiveUtils)2 Arrays (java.util.Arrays)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Map (java.util.Map)2 Vector (java.util.Vector)2 MapUtils (org.apache.commons.collections4.MapUtils)2 Assert.assertEquals (org.junit.Assert.assertEquals)2