Search in sources:

Example 6 with ColumnSpec

use of com.thinkbiganalytics.util.ColumnSpec in project kylo by Teradata.

The class CreateElasticsearchBackedHiveTableTest defines the method testGetHQLStatements.

@Test
public void testGetHQLStatements() throws Exception {
    // Mixed-case column names; note the expected DDL below contains only the
    // string columns (`name`, `phone`) in lower case — the int column `iD` is absent.
    ColumnSpec[] columns = new ColumnSpec[] {
        new ColumnSpec("name", "string", ""),
        new ColumnSpec("iD", "int", ""),
        new ColumnSpec("PHONE", "string", "")
    };
    List<String> statements = table.getHQLStatements(columns, NODES, FEED_ROOT, FEED, CATEGORY, "true", "true", "", JAR_URL, FIELD_STRING);
    // First statement registers the storage-handler jar.
    assertEquals("ADD JAR " + JAR_URL, statements.get(0));
    // Second statement creates the Elasticsearch-backed external index table.
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS " + CATEGORY + "." + FEED + "_index (`name` string, `phone` string, processing_dttm string, kylo_schema string, kylo_table string) " + "STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' " + "LOCATION '" + LOCATION + "' " + "TBLPROPERTIES('es.resource' = 'kylo-data/hive-data', 'es.nodes' = '" + NODES + "', 'es.nodes.wan.only' = 'true', 'es.index.auto.create' = 'true')", statements.get(1));
}
Also used : ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) Test(org.junit.Test)

Example 7 with ColumnSpec

use of com.thinkbiganalytics.util.ColumnSpec in project kylo by Teradata.

The class TableMergeSyncSupportTest defines the method doTestMergePK.

/**
 * Exercises primary-key merge into the given target table: an initial merge should
 * replace the single seeded row, repeated merges must not create duplicates, and no
 * record carrying the superseded "OLD" company value may survive a merge.
 *
 * @param targetSchema schema of the table being merged into
 * @param targetTable  name of the table being merged into
 * @param spec         partition specification of the target table
 */
private void doTestMergePK(String targetSchema, String targetTable, PartitionSpec spec) {
    List<String> results = fetchEmployees(targetSchema, targetTable);
    assertEquals(1, results.size());
    ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
    ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
    ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    // We should have 4 records
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(4, results.size());
    assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
    // Run merge with dedupe and should get the following two additional results. The result should not include any duplicates in the target table.
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'OLD'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'OLD'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, results.size());
    // Verify no duplicates exist in the table; Set.add returns false for a repeated element
    HashSet<String> existing = new HashSet<>();
    for (String r : results) {
        assertTrue("Duplicate record found: " + r, existing.add(r));
    }
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'ABC'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (102,'Buddy'," + "'ABC'," + "'94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(7, results.size());
    // Re-check for duplicates after the second partition's merge
    existing = new HashSet<>();
    for (String r : results) {
        assertTrue("Duplicate record found: " + r, existing.add(r));
    }
    assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
}
Also used : HiveShell(com.klarna.hiverunner.HiveShell) MapUtils(org.apache.commons.collections4.MapUtils) Arrays(java.util.Arrays) HiveRunnerConfig(com.klarna.hiverunner.config.HiveRunnerConfig) RunWith(org.junit.runner.RunWith) PartitionSpec(com.thinkbiganalytics.util.PartitionSpec) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) HashMap(java.util.HashMap) ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) HiveUtils(com.thinkbiganalytics.hive.util.HiveUtils) HashSet(java.util.HashSet) HiveProperties(com.klarna.hiverunner.annotations.HiveProperties) HiveSQL(com.klarna.hiverunner.annotations.HiveSQL) SQLException(java.sql.SQLException) List(java.util.List) Vector(java.util.Vector) HiveRunnerSetup(com.klarna.hiverunner.annotations.HiveRunnerSetup) PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) Assert.assertFalse(org.junit.Assert.assertFalse) Map(java.util.Map) StandaloneHiveRunner(com.klarna.hiverunner.StandaloneHiveRunner) Assert.assertEquals(org.junit.Assert.assertEquals) Before(org.junit.Before) ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) HashSet(java.util.HashSet)

Example 8 with ColumnSpec

use of com.thinkbiganalytics.util.ColumnSpec in project kylo by Teradata.

The class TableMergeSyncSupportTest defines the method testMergePKWithEmptyTargetTable.

/**
 * Tests the merge with an empty target table: an initial PK merge should populate it,
 * a repeated merge from the same source must be idempotent, and merging a later
 * processing partition should update one existing row and insert one new row.
 */
@Test
public void testMergePKWithEmptyTargetTable() throws Exception {
    List<String> results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(0, results.size());
    ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
    ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
    ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
    // We should have 4 records
    results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(4, results.size());
    // Merge with same source should leave us with 4 records
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), processingPartition, columnSpecs);
    results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(4, results.size());
    // Should update 1 and add 1
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') (  `id`,  `timestamp`, `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values " + "(1,'1'," + "'NEW VALUE'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074350') (  `id`,  `timestamp`, `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values " + "(10010,'1'," + "'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTableNP, new PartitionSpec(), "20160119074350", columnSpecs);
    // One row updated (id=1) and one inserted (id=10010): 5 records total
    results = fetchEmployees(targetSchema, targetTableNP);
    assertEquals(5, results.size());
}
Also used : ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) PartitionSpec(com.thinkbiganalytics.util.PartitionSpec) Test(org.junit.Test)

Example 9 with ColumnSpec

use of com.thinkbiganalytics.util.ColumnSpec in project kylo by Teradata.

The class TableRegisterSupportTest defines the method testTableCreateS3.

@Test
public void testTableCreateS3() {
    // Column and partition definitions parsed from pipe-delimited "name|type[|comment]" lines.
    ColumnSpec[] columns = ColumnSpec.createFromString("id|bigint|my comment\nname|string\ncompany|string|some description\nzip|string\nphone|string\nemail|string\ncountry|string\nhired|date");
    ColumnSpec[] partitions = ColumnSpec.createFromString("year|int\ncountry|string");
    TableRegisterConfiguration config = new TableRegisterConfiguration("s3a://testBucket/model.db/", "s3a://testBucket/model.db/", "s3a://testBucket/app/warehouse/");
    TableRegisterSupport registerSupport = new TableRegisterSupport(connection, config);
    // Every table type except MASTER lives under the model.db prefix; MASTER uses the warehouse prefix.
    for (TableType type : new TableType[] { TableType.FEED, TableType.INVALID, TableType.VALID, TableType.MASTER }) {
        String ddl = registerSupport.createDDL("bar", "employee", columns, partitions, "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'", "stored as orc", "tblproperties (\"orc.compress\"=\"SNAPPY\")", type);
        String actualLocation = StringUtils.substringBetween(ddl, "LOCATION '", "'");
        if (type != TableType.MASTER) {
            assertEquals("Locations do not match", "s3a://testBucket/model.db/bar/employee/" + type.toString().toLowerCase(), actualLocation);
        } else {
            assertEquals("Master location does not match", "s3a://testBucket/app/warehouse/bar/employee", actualLocation);
        }
    }
}
Also used : ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) TableType(com.thinkbiganalytics.util.TableType) TableRegisterConfiguration(com.thinkbiganalytics.util.TableRegisterConfiguration) Test(org.junit.Test)

Example 10 with ColumnSpec

use of com.thinkbiganalytics.util.ColumnSpec in project kylo by Teradata.

The class TableMergeSyncSupportTest defines the method doTestMergePKWithDifferentPartitions.

/**
 * Tests the ability to strip records that match the ID but are in a different partition
 * than the newer record: after a PK merge, older rows (company "OLD VALUE"/"OLD") with the
 * same id must be removed even when they sit in another partition, and repeated merges
 * must never leave duplicate rows in the target table.
 *
 * @param targetSchema schema of the table being merged into
 * @param targetTable  name of the table being merged into
 * @param spec         partition specification of the target table
 */
private void doTestMergePKWithDifferentPartitions(String targetSchema, String targetTable, PartitionSpec spec) {
    // Insert one record to start
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2012) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1,'1','Sally','OLD VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2012-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2012) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1002,'1','Jimbo','VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2012-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2015) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1000,'1','Jill','ORIG','94550'," + "'555-1212'," + "'sally@acme.org','2015-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2013) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (2,'1','Bill','OLD VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2013-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2013) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (3,'1','Ray','OLD VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2013-01-01');");
    hiveShell.execute("insert into emp_sr.employee partition(country='USA',year=2013) (  `id`,  `timestamp`,`name`,`company`,`zip`,`phone`,`email`,  `hired`)  values (1001,'1','Fred','VALUE','94550'," + "'555-1212'," + "'sally@acme.org','2013-01-01');");
    List<String> results = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, results.size());
    ColumnSpec columnSpec1 = new ColumnSpec("id", "String", "", true, false, false);
    ColumnSpec columnSpec2 = new ColumnSpec("name", "String", "", false, false, false);
    ColumnSpec[] columnSpecs = Arrays.asList(columnSpec1, columnSpec2).toArray(new ColumnSpec[0]);
    // Call merge
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    // We should have 6 records
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(6, results.size());
    assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
    // Run merge with dedupe and should get the following two additional results. The result should not include any duplicates in the target table.
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'OLD'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074340') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'OLD'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, processingPartition, columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(8, results.size());
    // Verify no duplicates exist in the table; Set.add returns false for a repeated element
    HashSet<String> existing = new HashSet<>();
    for (String r : results) {
        assertTrue("Duplicate record found: " + r, existing.add(r));
    }
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (100,'Bruce'," + "'ABC'," + "'94550','555-1212','bruce@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (101,'Harry'," + "'ABC'," + "'94550','555-1212','harry@acme.org','2016-01-01','Canada');");
    hiveShell.execute("insert into emp_sr.employee_valid partition(processing_dttm='20160119074540') (  `id`,  `name`,`company`,`zip`,`phone`,`email`,  `hired`,`country`) values (102,'Buddy'," + "'ABC'," + "'94550','555-1212','buddy@acme.org','2016-01-01','Canada');");
    mergeSyncSupport.doPKMerge(sourceSchema, sourceTable, targetSchema, targetTable, spec, "20160119074540", columnSpecs);
    results = fetchEmployees(targetSchema, targetTable);
    assertEquals(9, results.size());
    // Re-check for duplicates after the second partition's merge
    existing = new HashSet<>();
    for (String r : results) {
        assertTrue("Duplicate record found: " + r, existing.add(r));
    }
    assertFalse("Should not have old value", results.stream().anyMatch(s -> s.contains("OLD")));
}
Also used : HiveShell(com.klarna.hiverunner.HiveShell) MapUtils(org.apache.commons.collections4.MapUtils) Arrays(java.util.Arrays) HiveRunnerConfig(com.klarna.hiverunner.config.HiveRunnerConfig) RunWith(org.junit.runner.RunWith) PartitionSpec(com.thinkbiganalytics.util.PartitionSpec) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) HashMap(java.util.HashMap) ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) HiveUtils(com.thinkbiganalytics.hive.util.HiveUtils) HashSet(java.util.HashSet) HiveProperties(com.klarna.hiverunner.annotations.HiveProperties) HiveSQL(com.klarna.hiverunner.annotations.HiveSQL) SQLException(java.sql.SQLException) List(java.util.List) Vector(java.util.Vector) HiveRunnerSetup(com.klarna.hiverunner.annotations.HiveRunnerSetup) PartitionBatch(com.thinkbiganalytics.util.PartitionBatch) Assert.assertFalse(org.junit.Assert.assertFalse) Map(java.util.Map) StandaloneHiveRunner(com.klarna.hiverunner.StandaloneHiveRunner) Assert.assertEquals(org.junit.Assert.assertEquals) Before(org.junit.Before) ColumnSpec(com.thinkbiganalytics.util.ColumnSpec) HashSet(java.util.HashSet)

Aggregations

ColumnSpec (com.thinkbiganalytics.util.ColumnSpec)14 Test (org.junit.Test)9 TableRegisterConfiguration (com.thinkbiganalytics.util.TableRegisterConfiguration)5 TableType (com.thinkbiganalytics.util.TableType)5 StringUtils (org.apache.commons.lang3.StringUtils)5 PartitionSpec (com.thinkbiganalytics.util.PartitionSpec)4 SQLException (java.sql.SQLException)4 FlowFile (org.apache.nifi.flowfile.FlowFile)4 HiveUtils (com.thinkbiganalytics.hive.util.HiveUtils)3 ThriftService (com.thinkbiganalytics.nifi.v2.thrift.ThriftService)3 Connection (java.sql.Connection)3 Arrays (java.util.Arrays)3 HashSet (java.util.HashSet)3 List (java.util.List)3 ProcessException (org.apache.nifi.processor.exception.ProcessException)3 HiveShell (com.klarna.hiverunner.HiveShell)2 StandaloneHiveRunner (com.klarna.hiverunner.StandaloneHiveRunner)2 HiveProperties (com.klarna.hiverunner.annotations.HiveProperties)2 HiveRunnerSetup (com.klarna.hiverunner.annotations.HiveRunnerSetup)2 HiveSQL (com.klarna.hiverunner.annotations.HiveSQL)2