Search in sources:

Example 1 with TestTxnCommands2.runWorker

use of org.apache.hadoop.hive.ql.TestTxnCommands2.runWorker in project hive by apache.
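Both usages on this page follow the same cycle: enqueue a compaction request through the TxnStore, run one compactor Worker pass with runWorker, optionally run the Cleaner with runCleaner, and then verify the queue state. A minimal sketch of that cycle, using only the APIs that appear in the examples below (the table name "some_table" is a placeholder, not taken from the tests):

// Minimal sketch of the enqueue / worker / cleaner cycle shared by the examples below.
// "some_table" is a placeholder table name.
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
CompactionRequest rqst = new CompactionRequest("default", "some_table", CompactionType.MAJOR);
txnHandler.compact(rqst);   // enqueue the request
runWorker(conf);            // run one Worker cycle (static import from TestTxnCommands2)
runCleaner(conf);           // run one Cleaner cycle to remove obsolete files
List<ShowCompactResponseElement> compacts = txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
assertEquals(TxnStore.SUCCEEDED_RESPONSE, compacts.get(0).getState());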

From the class TestCompactor, method testCompactionOnDataLoadedInPath:

/**
 * Tests compaction of tables that were populated by LOAD DATA INPATH statements.
 *
 * In this scenario, the original ORC files are structured in the following way:
 * comp3
 * |--delta_0000001_0000001_0000
 *    |--000000_0
 * |--delta_0000002_0000002_0000
 *    |--000000_0
 *    |--000001_0
 *
 * ...where the comp3 table is not bucketed.
 *
 * @throws Exception
 */
@Test
public void testCompactionOnDataLoadedInPath() throws Exception {
    // Set up the LOAD DATA INPATH scenario.
    executeStatementOnDriver("drop table if exists comp0", driver);
    executeStatementOnDriver("drop table if exists comp1", driver);
    executeStatementOnDriver("drop table if exists comp3", driver);
    executeStatementOnDriver("create external table comp0 (a string)", driver);
    executeStatementOnDriver("insert into comp0 values ('1111111111111')", driver);
    executeStatementOnDriver("insert into comp0 values ('2222222222222')", driver);
    executeStatementOnDriver("insert into comp0 values ('3333333333333')", driver);
    executeStatementOnDriver("create external table comp1 stored as orc as select * from comp0", driver);
    executeStatementOnDriver("create table comp3 (a string) stored as orc " + "TBLPROPERTIES ('transactional'='true')", driver);
    IMetaStoreClient hmsClient = new HiveMetaStoreClient(conf);
    Table table = hmsClient.getTable("default", "comp1");
    FileSystem fs = FileSystem.get(conf);
    Path path000 = fs.listStatus(new Path(table.getSd().getLocation()))[0].getPath();
    Path path001 = new Path(path000.toString().replace("000000", "000001"));
    Path path002 = new Path(path000.toString().replace("000000", "000002"));
    fs.copyFromLocalFile(path000, path001);
    fs.copyFromLocalFile(path000, path002);
    executeStatementOnDriver("load data inpath '" + path002.toString() + "' into table comp3", driver);
    executeStatementOnDriver("load data inpath '" + path002.getParent().toString() + "' into table comp3", driver);
    // Run compaction.
    TxnStore txnHandler = TxnUtils.getTxnStore(conf);
    CompactionRequest rqst = new CompactionRequest("default", "comp3", CompactionType.MAJOR);
    txnHandler.compact(rqst);
    runWorker(conf);
    ShowCompactRequest scRqst = new ShowCompactRequest();
    List<ShowCompactResponseElement> compacts = txnHandler.showCompact(scRqst).getCompacts();
    assertEquals(1, compacts.size());
    assertEquals(TxnStore.CLEANING_RESPONSE, compacts.get(0).getState());
    runCleaner(conf);
    compacts = txnHandler.showCompact(scRqst).getCompacts();
    assertEquals(1, compacts.size());
    assertEquals(TxnStore.SUCCEEDED_RESPONSE, compacts.get(0).getState());
    // Check compacted content and file structure.
    table = hmsClient.getTable("default", "comp3");
    List<String> rs = execSelectAndDumpData("select * from comp3", driver, "select");
    assertEquals(9, rs.size());
    assertEquals(3, rs.stream().filter(p -> "1111111111111".equals(p)).count());
    assertEquals(3, rs.stream().filter(p -> "2222222222222".equals(p)).count());
    assertEquals(3, rs.stream().filter(p -> "3333333333333".equals(p)).count());
    FileStatus[] files = fs.listStatus(new Path(table.getSd().getLocation()));
    // After MAJOR compaction the table location contains a single base directory; its name encodes
    // the highest compacted write id (0000002) and the compactor's visibility txn id (v0000012).
    assertEquals(1, files.length);
    assertEquals("base_0000002_v0000012", files[0].getPath().getName());
    files = fs.listStatus(files[0].getPath(), AcidUtils.bucketFileFilter);
    // bucket files inside the base directory
    assertEquals(2, files.length);
    assertEquals(1, Arrays.stream(files).filter(p -> "bucket_00000".equals(p.getPath().getName())).count());
    assertEquals(1, Arrays.stream(files).filter(p -> "bucket_00001".equals(p.getPath().getName())).count());
    // Another insert into the newly compacted table.
    executeStatementOnDriver("insert into comp3 values ('4444444444444')", driver);
    // Compact again, this time including the extra row.
    txnHandler.compact(rqst);
    runWorker(conf);
    compacts = txnHandler.showCompact(scRqst).getCompacts();
    assertEquals(2, compacts.size());
    assertEquals(TxnStore.CLEANING_RESPONSE, compacts.get(0).getState());
    runCleaner(conf);
    compacts = txnHandler.showCompact(scRqst).getCompacts();
    assertEquals(2, compacts.size());
    assertEquals(TxnStore.SUCCEEDED_RESPONSE, compacts.get(0).getState());
    // Check compacted content and file structure.
    rs = execSelectAndDumpData("select * from comp3", driver, "select");
    assertEquals(10, rs.size());
    assertEquals(3, rs.stream().filter(p -> "1111111111111".equals(p)).count());
    assertEquals(3, rs.stream().filter(p -> "2222222222222".equals(p)).count());
    assertEquals(3, rs.stream().filter(p -> "3333333333333".equals(p)).count());
    assertEquals(1, rs.stream().filter(p -> "4444444444444".equals(p)).count());
    files = fs.listStatus(new Path(table.getSd().getLocation()));
    // base dir: a new base directory replaces the previous one after the second compaction
    assertEquals(1, files.length);
    assertEquals("base_0000004_v0000016", files[0].getPath().getName());
    files = fs.listStatus(files[0].getPath(), AcidUtils.bucketFileFilter);
    // bucket files inside the new base directory
    assertEquals(2, files.length);
    assertEquals(1, Arrays.stream(files).filter(p -> "bucket_00000".equals(p.getPath().getName())).count());
    assertEquals(1, Arrays.stream(files).filter(p -> "bucket_00001".equals(p.getPath().getName())).count());
}
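For comparison, before runWorker is invoked in the test above, the comp3 table location still holds the two delta directories sketched in the javadoc. A rough way such a pre-compaction layout could be asserted (AcidUtils.deltaFileFilter is an assumption here; the test itself only inspects the post-compaction base):

// Hypothetical pre-compaction check for the delta layout shown in the javadoc above.
Path comp3Location = new Path(hmsClient.getTable("default", "comp3").getSd().getLocation());
FileStatus[] deltas = fs.listStatus(comp3Location, AcidUtils.deltaFileFilter);
// expected: delta_0000001_0000001_0000 and delta_0000002_0000002_0000
assertEquals(2, deltas.length);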
Also used : Path(org.apache.hadoop.fs.Path) DriverFactory(org.apache.hadoop.hive.ql.DriverFactory) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) CompactionRequest(org.apache.hadoop.hive.metastore.api.CompactionRequest) Arrays(java.util.Arrays) SortedSet(java.util.SortedSet) TestTxnDbUtil(org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil) StreamingConnection(org.apache.hive.streaming.StreamingConnection) FileSystem(org.apache.hadoop.fs.FileSystem) HiveStreamingConnection(org.apache.hive.streaming.HiveStreamingConnection) ConfVars(org.apache.hadoop.hive.conf.HiveConf.ConfVars) LoggerFactory(org.slf4j.LoggerFactory) Random(java.util.Random) FileStatus(org.apache.hadoop.fs.FileStatus) CompactionType(org.apache.hadoop.hive.metastore.api.CompactionType) TestTxnCommands2.runWorker(org.apache.hadoop.hive.ql.TestTxnCommands2.runWorker) Mockito.doThrow(org.mockito.Mockito.doThrow) ShowCompactRequest(org.apache.hadoop.hive.metastore.api.ShowCompactRequest) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) OrcConf(org.apache.orc.OrcConf) Mockito.doAnswer(org.mockito.Mockito.doAnswer) Map(java.util.Map) After(org.junit.After) Path(org.apache.hadoop.fs.Path) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) ShowCompactResponse(org.apache.hadoop.hive.metastore.api.ShowCompactResponse) FileUtil(org.apache.hadoop.fs.FileUtil) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) IDriver(org.apache.hadoop.hive.ql.IDriver) TxnStore(org.apache.hadoop.hive.metastore.txn.TxnStore) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) CompactionInfo(org.apache.hadoop.hive.metastore.txn.CompactionInfo) ShowCompactResponseElement(org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement) SessionState(org.apache.hadoop.hive.ql.session.SessionState) Retry(org.apache.hive.common.util.Retry) TxnUtils(org.apache.hadoop.hive.metastore.txn.TxnUtils) List(java.util.List) HCatUtil(org.apache.hive.hcatalog.common.HCatUtil) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) StrictDelimitedInputWriter(org.apache.hive.streaming.StrictDelimitedInputWriter) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) VISIBILITY_PATTERN(org.apache.hadoop.hive.common.AcidConstants.VISIBILITY_PATTERN) ArgumentMatchers.any(org.mockito.ArgumentMatchers.any) HiveMetaStoreClient(org.apache.hadoop.hive.metastore.HiveMetaStoreClient) TestTxnCommands2.runCleaner(org.apache.hadoop.hive.ql.TestTxnCommands2.runCleaner) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) PathFilter(org.apache.hadoop.fs.PathFilter) TestTxnCommands2.runInitiator(org.apache.hadoop.hive.ql.TestTxnCommands2.runInitiator) HashMap(java.util.HashMap) Partition(org.apache.hadoop.hive.metastore.api.Partition) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) Lists(com.google.common.collect.Lists) Constants(org.apache.hadoop.hive.conf.Constants) Before(org.junit.Before) Hive(org.apache.hadoop.hive.ql.metadata.Hive) StreamingException(org.apache.hive.streaming.StreamingException) Logger(org.slf4j.Logger) FileWriter(java.io.FileWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Mockito.times(org.mockito.Mockito.times) IOException(java.io.IOException) Test(org.junit.Test) CliSessionState(org.apache.hadoop.hive.cli.CliSessionState) File(java.io.File) Table(org.apache.hadoop.hive.metastore.api.Table) Mockito.verify(org.mockito.Mockito.verify) TimeUnit(java.util.concurrent.TimeUnit) Mockito(org.mockito.Mockito) Rule(org.junit.Rule) 
FieldSetter(org.mockito.internal.util.reflection.FieldSetter) Assert(org.junit.Assert) Collections(java.util.Collections) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) Assert.assertEquals(org.junit.Assert.assertEquals) TemporaryFolder(org.junit.rules.TemporaryFolder) HiveMetaStoreClient(org.apache.hadoop.hive.metastore.HiveMetaStoreClient) Table(org.apache.hadoop.hive.metastore.api.Table) FileStatus(org.apache.hadoop.fs.FileStatus) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) FileSystem(org.apache.hadoop.fs.FileSystem) ShowCompactRequest(org.apache.hadoop.hive.metastore.api.ShowCompactRequest) TxnStore(org.apache.hadoop.hive.metastore.txn.TxnStore) CompactionRequest(org.apache.hadoop.hive.metastore.api.CompactionRequest) ShowCompactResponseElement(org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement) Test(org.junit.Test)

Example 2 with TestTxnCommands2.runWorker

use of org.apache.hadoop.hive.ql.TestTxnCommands2.runWorker in project hive by apache.

From the class TestCompactor, method testStatsAfterCompactionPartTbl:

/**
 * After each major compaction, stats need to be updated on the table:
 * 1. Create a partitioned, ORC-backed table (ORC is currently required by ACID).
 * 2. Populate it with data.
 * 3. Compute stats.
 * 4. Trigger a major compaction on one of the partitions (which should update its stats).
 * 5. Check that stats have been updated for that partition only.
 *
 * @throws Exception
 * TODO: add a test with a sorted table
 */
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
    // as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
    String dbName = "default";
    String tblName = "compaction_test";
    executeStatementOnDriver("drop table if exists " + tblName, driver);
    executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(bkt INT)" + // currently ACID requires table to be bucketed
    " CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC  TBLPROPERTIES ('transactional'='true')", driver);
    StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
    HiveStreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withStaticPartitionValues(Arrays.asList("0")).withAgentInfo("UT_" + Thread.currentThread().getName()).withHiveConf(conf).withRecordWriter(writer).connect();
    connection.beginTransaction();
    connection.write("55, 'London'".getBytes());
    connection.commitTransaction();
    connection.beginTransaction();
    connection.write("56, 'Paris'".getBytes());
    connection.commitTransaction();
    connection.close();
    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(57, 'Budapest')", driver);
    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(58, 'Milano')", driver);
    execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
    TxnStore txnHandler = TxnUtils.getTxnStore(conf);
    Table table = msClient.getTable(dbName, tblName);
    // compute stats before compaction
    CompactionInfo ci = new CompactionInfo(dbName, tblName, "bkt=0", CompactionType.MAJOR);
    Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
    ci = new CompactionInfo(dbName, tblName, "bkt=1", CompactionType.MAJOR);
    Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
    // Check basic stats are collected
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = Hive.get().getTable(tblName);
    List<org.apache.hadoop.hive.ql.metadata.Partition> partitions = Hive.get().getPartitions(hiveTable);
    Map<String, String> parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "1373", parameters.get("totalSize"));
    parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
    // Do a major compaction
    CompactionRequest rqst = new CompactionRequest(dbName, tblName, CompactionType.MAJOR);
    rqst.setPartitionname("bkt=0");
    txnHandler.compact(rqst);
    runWorker(conf);
    ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
    List<ShowCompactResponseElement> compacts = rsp.getCompacts();
    if (1 != compacts.size()) {
        Assert.fail("Expecting 1 compaction and found " + compacts.size() + " compactions " + compacts);
    }
    Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
    // Check basic stats are updated for partition bkt=0, but not updated for partition bkt=1
    partitions = Hive.get().getPartitions(hiveTable);
    parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "1", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "801", parameters.get("totalSize"));
    parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
}
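The static imports listed below also include TestTxnCommands2.runInitiator. A variant of the flow above could let the Initiator queue the compaction instead of calling txnHandler.compact() directly; the sketch below is hypothetical, and the property name and threshold value are assumptions rather than something taken from this test:

// Hypothetical variant: let the Initiator decide to compact once enough deltas accumulate.
// The property name "hive.compactor.delta.num.threshold" and the value 1 are assumptions.
conf.setInt("hive.compactor.delta.num.threshold", 1);
runInitiator(conf);   // should enqueue a compaction for tables/partitions over the threshold
runWorker(conf);      // perform the queued compaction
runCleaner(conf);     // remove obsolete delta directories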
Also used : HiveStreamingConnection(org.apache.hive.streaming.HiveStreamingConnection) DriverFactory(org.apache.hadoop.hive.ql.DriverFactory) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) CompactionRequest(org.apache.hadoop.hive.metastore.api.CompactionRequest) Arrays(java.util.Arrays) SortedSet(java.util.SortedSet) TestTxnDbUtil(org.apache.hadoop.hive.metastore.utils.TestTxnDbUtil) StreamingConnection(org.apache.hive.streaming.StreamingConnection) FileSystem(org.apache.hadoop.fs.FileSystem) HiveStreamingConnection(org.apache.hive.streaming.HiveStreamingConnection) ConfVars(org.apache.hadoop.hive.conf.HiveConf.ConfVars) LoggerFactory(org.slf4j.LoggerFactory) Random(java.util.Random) FileStatus(org.apache.hadoop.fs.FileStatus) CompactionType(org.apache.hadoop.hive.metastore.api.CompactionType) TestTxnCommands2.runWorker(org.apache.hadoop.hive.ql.TestTxnCommands2.runWorker) Mockito.doThrow(org.mockito.Mockito.doThrow) ShowCompactRequest(org.apache.hadoop.hive.metastore.api.ShowCompactRequest) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) OrcConf(org.apache.orc.OrcConf) Mockito.doAnswer(org.mockito.Mockito.doAnswer) Map(java.util.Map) After(org.junit.After) Path(org.apache.hadoop.fs.Path) Reader(org.apache.hadoop.hive.ql.io.orc.Reader) ShowCompactResponse(org.apache.hadoop.hive.metastore.api.ShowCompactResponse) FileUtil(org.apache.hadoop.fs.FileUtil) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) IDriver(org.apache.hadoop.hive.ql.IDriver) TxnStore(org.apache.hadoop.hive.metastore.txn.TxnStore) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) CompactionInfo(org.apache.hadoop.hive.metastore.txn.CompactionInfo) ShowCompactResponseElement(org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement) SessionState(org.apache.hadoop.hive.ql.session.SessionState) Retry(org.apache.hive.common.util.Retry) TxnUtils(org.apache.hadoop.hive.metastore.txn.TxnUtils) List(java.util.List) HCatUtil(org.apache.hive.hcatalog.common.HCatUtil) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) StrictDelimitedInputWriter(org.apache.hive.streaming.StrictDelimitedInputWriter) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) VISIBILITY_PATTERN(org.apache.hadoop.hive.common.AcidConstants.VISIBILITY_PATTERN) ArgumentMatchers.any(org.mockito.ArgumentMatchers.any) HiveMetaStoreClient(org.apache.hadoop.hive.metastore.HiveMetaStoreClient) TestTxnCommands2.runCleaner(org.apache.hadoop.hive.ql.TestTxnCommands2.runCleaner) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) PathFilter(org.apache.hadoop.fs.PathFilter) TestTxnCommands2.runInitiator(org.apache.hadoop.hive.ql.TestTxnCommands2.runInitiator) HashMap(java.util.HashMap) Partition(org.apache.hadoop.hive.metastore.api.Partition) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) Lists(com.google.common.collect.Lists) Constants(org.apache.hadoop.hive.conf.Constants) Before(org.junit.Before) Hive(org.apache.hadoop.hive.ql.metadata.Hive) StreamingException(org.apache.hive.streaming.StreamingException) Logger(org.slf4j.Logger) FileWriter(java.io.FileWriter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Mockito.times(org.mockito.Mockito.times) IOException(java.io.IOException) Test(org.junit.Test) CliSessionState(org.apache.hadoop.hive.cli.CliSessionState) File(java.io.File) Table(org.apache.hadoop.hive.metastore.api.Table) Mockito.verify(org.mockito.Mockito.verify) TimeUnit(java.util.concurrent.TimeUnit) 
Mockito(org.mockito.Mockito) Rule(org.junit.Rule) FieldSetter(org.mockito.internal.util.reflection.FieldSetter) Assert(org.junit.Assert) Collections(java.util.Collections) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) Assert.assertEquals(org.junit.Assert.assertEquals) TemporaryFolder(org.junit.rules.TemporaryFolder) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) StrictDelimitedInputWriter(org.apache.hive.streaming.StrictDelimitedInputWriter) ShowCompactResponse(org.apache.hadoop.hive.metastore.api.ShowCompactResponse) CompactionInfo(org.apache.hadoop.hive.metastore.txn.CompactionInfo) ShowCompactRequest(org.apache.hadoop.hive.metastore.api.ShowCompactRequest) TxnStore(org.apache.hadoop.hive.metastore.txn.TxnStore) CompactionRequest(org.apache.hadoop.hive.metastore.api.CompactionRequest) ShowCompactResponseElement(org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement) Test(org.junit.Test)

Aggregations

Lists (com.google.common.collect.Lists)2 File (java.io.File)2 FileWriter (java.io.FileWriter)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Arrays (java.util.Arrays)2 Collections (java.util.Collections)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Map (java.util.Map)2 Random (java.util.Random)2 SortedSet (java.util.SortedSet)2 TreeSet (java.util.TreeSet)2 TimeUnit (java.util.concurrent.TimeUnit)2 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 FileUtil (org.apache.hadoop.fs.FileUtil)2 Path (org.apache.hadoop.fs.Path)2