
Example 16 with CompactionInfo

Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.

From the class TestCompactor, method testStatsAfterCompactionPartTbl.

/**
 * After each major compaction, stats need to be updated on the table
 * 1. create a partitioned ORC backed table (Orc is currently required by ACID)
 * 2. populate with data
 * 3. compute stats
 * 4. Trigger major compaction on one of the partitions (which should update stats)
 * 5. check that stats have been updated for that partition only
 *
 * @throws Exception
 * TODO: add a test with a sorted table?
 */
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
    // as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
    String dbName = "default";
    String tblName = "compaction_test";
    executeStatementOnDriver("drop table if exists " + tblName, driver);
    executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(bkt INT)" + // currently ACID requires table to be bucketed
    " CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC  TBLPROPERTIES ('transactional'='true')", driver);
    StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
    HiveStreamingConnection connection = HiveStreamingConnection.newBuilder()
        .withDatabase(dbName)
        .withTable(tblName)
        .withStaticPartitionValues(Arrays.asList("0"))
        .withAgentInfo("UT_" + Thread.currentThread().getName())
        .withHiveConf(conf)
        .withRecordWriter(writer)
        .connect();
    connection.beginTransaction();
    connection.write("55, 'London'".getBytes());
    connection.commitTransaction();
    connection.beginTransaction();
    connection.write("56, 'Paris'".getBytes());
    connection.commitTransaction();
    connection.close();
    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(57, 'Budapest')", driver);
    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(58, 'Milano')", driver);
    execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
    TxnStore txnHandler = TxnUtils.getTxnStore(conf);
    Table table = msClient.getTable(dbName, tblName);
    // compute stats before compaction
    CompactionInfo ci = new CompactionInfo(dbName, tblName, "bkt=0", CompactionType.MAJOR);
    Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
    ci = new CompactionInfo(dbName, tblName, "bkt=1", CompactionType.MAJOR);
    Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
    // Check basic stats are collected
    org.apache.hadoop.hive.ql.metadata.Table hiveTable = Hive.get().getTable(tblName);
    List<org.apache.hadoop.hive.ql.metadata.Partition> partitions = Hive.get().getPartitions(hiveTable);
    Map<String, String> parameters = partitions.stream()
        .filter(p -> p.getName().equals("bkt=0"))
        .findFirst()
        .orElseThrow(() -> new RuntimeException("Could not get Partition"))
        .getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "1373", parameters.get("totalSize"));
    parameters = partitions.stream()
        .filter(p -> p.getName().equals("bkt=1"))
        .findFirst()
        .orElseThrow(() -> new RuntimeException("Could not get Partition"))
        .getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
    // Do a major compaction
    CompactionRequest rqst = new CompactionRequest(dbName, tblName, CompactionType.MAJOR);
    rqst.setPartitionname("bkt=0");
    txnHandler.compact(rqst);
    runWorker(conf);
    ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
    List<ShowCompactResponseElement> compacts = rsp.getCompacts();
    Assert.assertEquals("Expecting exactly 1 compaction: " + compacts, 1, compacts.size());
    Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
    // Check basic stats are updated for partition bkt=0, but not updated for partition bkt=1
    partitions = Hive.get().getPartitions(hiveTable);
    parameters = partitions.stream()
        .filter(p -> p.getName().equals("bkt=0"))
        .findFirst()
        .orElseThrow(() -> new RuntimeException("Could not get Partition"))
        .getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "1", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "801", parameters.get("totalSize"));
    parameters = partitions.stream()
        .filter(p -> p.getName().equals("bkt=1"))
        .findFirst()
        .orElseThrow(() -> new RuntimeException("Could not get Partition"))
        .getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
}
Also used : HiveStreamingConnection(org.apache.hive.streaming.HiveStreamingConnection) StrictDelimitedInputWriter(org.apache.hive.streaming.StrictDelimitedInputWriter) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) ShowCompactResponse(org.apache.hadoop.hive.metastore.api.ShowCompactResponse) ShowCompactRequest(org.apache.hadoop.hive.metastore.api.ShowCompactRequest) ShowCompactResponseElement(org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement) CompactionRequest(org.apache.hadoop.hive.metastore.api.CompactionRequest) CompactionInfo(org.apache.hadoop.hive.metastore.txn.CompactionInfo) TxnStore(org.apache.hadoop.hive.metastore.txn.TxnStore) Test(org.junit.Test)
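
The test asserts the compaction state immediately after runWorker(conf), which assumes the worker completed synchronously. Where the worker runs on its own thread, a small polling helper keeps the check robust. A minimal sketch, assuming the same imports as the example above; waitForCompactionState is a hypothetical helper name, not part of the Hive tests:

// Sketch only: poll the TxnStore until the most recent compaction reaches the
// expected state, or fail once the timeout elapses.
private static ShowCompactResponseElement waitForCompactionState(TxnStore txnHandler,
        String expectedState, long timeoutMs) throws Exception {
    long deadline = System.currentTimeMillis() + timeoutMs;
    while (System.currentTimeMillis() < deadline) {
        List<ShowCompactResponseElement> compacts =
                txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
        if (!compacts.isEmpty() && expectedState.equals(compacts.get(0).getState())) {
            return compacts.get(0);
        }
        Thread.sleep(100);
    }
    throw new AssertionError("Compaction did not reach state '" + expectedState + "'");
}

With such a helper, the state check above becomes waitForCompactionState(txnHandler, "ready for cleaning", 30_000).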

Example 17 with CompactionInfo

Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.

From the class TestCrudCompactorOnTez, method testStatsAfterQueryCompactionOnTez.

/**
 * After each major compaction, stats need to be updated on the table
 * 1. create an ORC backed table (Orc is currently required by ACID)
 * 2. populate with data
 * 3. compute stats
 * 4. Trigger major compaction (which should update stats)
 * 5. check that stats have been updated
 */
@Test
public void testStatsAfterQueryCompactionOnTez() throws Exception {
    // as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
    String dbName = "default";
    String tblName = "compaction_test";
    executeStatementOnDriver("drop table if exists " + tblName, driver);
    executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + // currently ACID requires table to be bucketed
    " CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC  TBLPROPERTIES ('transactional'='true')", driver);
    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " values(55, 'London')", driver);
    executeStatementOnDriver("INSERT INTO TABLE " + tblName + " values(56, 'Paris')", driver);
    execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
    TxnStore txnHandler = TxnUtils.getTxnStore(conf);
    Table table = msClient.getTable(dbName, tblName);
    // compute stats before compaction
    CompactionInfo ci = new CompactionInfo(dbName, tblName, null, CompactionType.MAJOR);
    Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
    // Check basic stats are collected
    Map<String, String> parameters = Hive.get().getTable(tblName).getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "1434", parameters.get("totalSize"));
    // Do a major compaction
    CompactorTestUtil.runCompaction(conf, dbName, tblName, CompactionType.MAJOR, true);
    ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
    List<ShowCompactResponseElement> compacts = rsp.getCompacts();
    Assert.assertEquals("Expecting exactly 1 compaction: " + compacts, 1, compacts.size());
    Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
    // Check basic stats are updated
    parameters = Hive.get().getTable(tblName).getParameters();
    Assert.assertEquals("The number of files is differing from the expected", "1", parameters.get("numFiles"));
    Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
    Assert.assertEquals("The total table size is differing from the expected", "727", parameters.get("totalSize"));
}
Also used : Table(org.apache.hadoop.hive.metastore.api.Table) ShowCompactResponse(org.apache.hadoop.hive.metastore.api.ShowCompactResponse) CompactionInfo(org.apache.hadoop.hive.metastore.txn.CompactionInfo) ShowCompactRequest(org.apache.hadoop.hive.metastore.api.ShowCompactRequest) TxnStore(org.apache.hadoop.hive.metastore.txn.TxnStore) ShowCompactResponseElement(org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement) Test(org.junit.Test)
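
Examples 16 and 17 drive stats collection through the same two calls. A minimal sketch that condenses the pattern, reusing only calls shown above; gatherBasicStats is a hypothetical helper name:

// Sketch only: gather basic stats for the whole table (partName == null) or for
// a single partition such as "bkt=0", mirroring the calls in the tests above.
private static void gatherBasicStats(HiveConf conf, Table table, String dbName,
        String tblName, String partName) throws Exception {
    CompactionInfo ci = new CompactionInfo(dbName, tblName, partName, CompactionType.MAJOR);
    Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"),
            CompactorUtil.getCompactorJobQueueName(conf, ci, table));
}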

Example 18 with CompactionInfo

Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.

From the class HMSHandler, method commit_txn.

@Override
public void commit_txn(CommitTxnRequest rqst) throws TException {
    boolean isReplayedReplTxn = TxnType.REPL_CREATED.equals(rqst.getTxn_type());
    boolean isHiveReplTxn = rqst.isSetReplPolicy() && TxnType.DEFAULT.equals(rqst.getTxn_type());
    // in replication flow, the write notification log table will be updated here.
    if (rqst.isSetWriteEventInfos() && isReplayedReplTxn) {
        assert (rqst.isSetReplPolicy());
        long targetTxnId = getTxnHandler().getTargetTxnId(rqst.getReplPolicy(), rqst.getTxnid());
        if (targetTxnId < 0) {
            // looks like a retry
            return;
        }
        for (WriteEventInfo writeEventInfo : rqst.getWriteEventInfos()) {
            String[] filesAdded = ReplChangeManager.getListFromSeparatedString(writeEventInfo.getFiles());
            List<String> partitionValue = null;
            Partition ptnObj = null;
            String root;
            Table tbl = getTblObject(writeEventInfo.getDatabase(), writeEventInfo.getTable(), null);
            if (writeEventInfo.getPartition() != null && !writeEventInfo.getPartition().isEmpty()) {
                partitionValue = Warehouse.getPartValuesFromPartName(writeEventInfo.getPartition());
                ptnObj = getPartitionObj(writeEventInfo.getDatabase(), writeEventInfo.getTable(), partitionValue, tbl);
                root = ptnObj.getSd().getLocation();
            } else {
                root = tbl.getSd().getLocation();
            }
            InsertEventRequestData insertData = new InsertEventRequestData();
            insertData.setReplace(true);
            // The files in the commit txn message during load will have paths corresponding to the source
            // warehouse. Need to transform them to the target warehouse using the table or partition object location.
            for (String file : filesAdded) {
                String[] decodedPath = ReplChangeManager.decodeFileUri(file);
                String name = (new Path(decodedPath[0])).getName();
                Path newPath = FileUtils.getTransformedPath(name, decodedPath[3], root);
                insertData.addToFilesAdded(newPath.toUri().toString());
                insertData.addToSubDirectoryList(decodedPath[3]);
                try {
                    insertData.addToFilesAddedChecksum(ReplChangeManager.checksumFor(newPath, newPath.getFileSystem(conf)));
                } catch (IOException e) {
                    LOG.error("failed to get checksum for the file " + newPath + " with error: " + e.getMessage());
                    throw new TException(e.getMessage());
                }
            }
            WriteNotificationLogRequest wnRqst = new WriteNotificationLogRequest(targetTxnId, writeEventInfo.getWriteId(), writeEventInfo.getDatabase(), writeEventInfo.getTable(), insertData);
            if (partitionValue != null) {
                wnRqst.setPartitionVals(partitionValue);
            }
            addTxnWriteNotificationLog(tbl, ptnObj, wnRqst);
        }
    }
    getTxnHandler().commitTxn(rqst);
    if (listeners != null && !listeners.isEmpty() && !isHiveReplTxn) {
        MetaStoreListenerNotifier.notifyEvent(listeners, EventType.COMMIT_TXN, new CommitTxnEvent(rqst.getTxnid(), this));
        Optional<CompactionInfo> compactionInfo = getTxnHandler().getCompactionByTxnId(rqst.getTxnid());
        if (compactionInfo.isPresent()) {
            MetaStoreListenerNotifier.notifyEvent(listeners, EventType.COMMIT_COMPACTION, new CommitCompactionEvent(rqst.getTxnid(), compactionInfo.get(), this));
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TException(org.apache.thrift.TException) IOException(java.io.IOException) CompactionInfo(org.apache.hadoop.hive.metastore.txn.CompactionInfo)
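
The per-file loop is the heart of the path translation in commit_txn. Distilled into a standalone helper for readability, using only the calls that appear above; addTransformedFile is an illustrative name, not a method of HMSHandler:

// Sketch only: decode one ChangeManager-encoded file URI, rebase it under the
// target root, and record the file plus its checksum on the insert event data.
private void addTransformedFile(InsertEventRequestData insertData, String encodedFile,
        String root, Configuration conf) throws TException {
    String[] decodedPath = ReplChangeManager.decodeFileUri(encodedFile);
    String name = new Path(decodedPath[0]).getName();
    Path newPath = FileUtils.getTransformedPath(name, decodedPath[3], root);
    insertData.addToFilesAdded(newPath.toUri().toString());
    insertData.addToSubDirectoryList(decodedPath[3]);
    try {
        insertData.addToFilesAddedChecksum(
                ReplChangeManager.checksumFor(newPath, newPath.getFileSystem(conf)));
    } catch (IOException e) {
        throw new TException("failed to get checksum for " + newPath + ": " + e.getMessage());
    }
}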

Aggregations

CompactionInfo (org.apache.hadoop.hive.metastore.txn.CompactionInfo): 18
Table (org.apache.hadoop.hive.metastore.api.Table): 13
Test (org.junit.Test): 11
ShowCompactRequest (org.apache.hadoop.hive.metastore.api.ShowCompactRequest): 10
ShowCompactResponse (org.apache.hadoop.hive.metastore.api.ShowCompactResponse): 10
ArrayList (java.util.ArrayList): 9
CompactionRequest (org.apache.hadoop.hive.metastore.api.CompactionRequest): 9
ShowCompactResponseElement (org.apache.hadoop.hive.metastore.api.ShowCompactResponseElement): 8
Partition (org.apache.hadoop.hive.metastore.api.Partition): 7
IOException (java.io.IOException): 5
AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 5
Path (org.apache.hadoop.fs.Path): 5
ValidTxnList (org.apache.hadoop.hive.common.ValidTxnList): 5
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 5
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 5
CompactionType (org.apache.hadoop.hive.metastore.api.CompactionType): 4
GetValidWriteIdsRequest (org.apache.hadoop.hive.metastore.api.GetValidWriteIdsRequest): 4
LockComponent (org.apache.hadoop.hive.metastore.api.LockComponent): 4
LockRequest (org.apache.hadoop.hive.metastore.api.LockRequest): 4
LockResponse (org.apache.hadoop.hive.metastore.api.LockResponse): 4