Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.
The class TestCleaner, method blockedByLockTable.
@Test
public void blockedByLockTable() throws Exception {
Table t = newTable("default", "bblt", false);
addBaseFile(t, null, 20L, 20);
addDeltaFile(t, null, 21L, 22L, 2);
addDeltaFile(t, null, 23L, 24L, 2);
addDeltaFile(t, null, 21L, 24L, 4);
burnThroughTransactions(25);
CompactionRequest rqst = new CompactionRequest("default", "bblt", CompactionType.MINOR);
txnHandler.compact(rqst);
CompactionInfo ci = txnHandler.findNextToCompact("fred");
txnHandler.markCompacted(ci);
txnHandler.setRunAs(ci.id, System.getProperty("user.name"));
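// Simulate a concurrent reader: a table-level SHARED_READ lock on bblt that the cleaner must wait out before it may delete obsolete files.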
LockComponent comp = new LockComponent(LockType.SHARED_READ, LockLevel.TABLE, "default");
comp.setTablename("bblt");
comp.setOperationType(DataOperationType.SELECT);
List<LockComponent> components = new ArrayList<LockComponent>(1);
components.add(comp);
LockRequest req = new LockRequest(components, "me", "localhost");
LockResponse res = txnHandler.lock(req);
startCleaner();
// The table-level read lock should block the cleaner, so the request must still be pending in the "ready for cleaning" state.
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
Assert.assertEquals(1, compacts.size());
Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
Assert.assertEquals("bblt", compacts.get(0).getTablename());
Assert.assertEquals(CompactionType.MINOR, compacts.get(0).getType());
}
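A natural follow-up (not part of the test above) is to confirm that the cleaner proceeds once the blocking lock goes away. Below is a minimal sketch, assuming the same CompactorTest helpers (startCleaner, txnHandler) and that UnlockRequest is available in this test; the exact post-clean bookkeeping ("succeeded" history entry vs. the entry being dropped) depends on the TxnStore version.
// Hypothetical continuation: release the reader's lock and run the cleaner again.
txnHandler.unlock(new UnlockRequest(res.getLockid()));
startCleaner();
ShowCompactResponse rsp2 = txnHandler.showCompact(new ShowCompactRequest());
for (ShowCompactResponseElement e : rsp2.getCompacts()) {
  // Whatever the version-specific bookkeeping, nothing should remain "ready for cleaning".
  Assert.assertFalse("ready for cleaning".equals(e.getState()));
}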
Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.
The class TestCompactor, method testStatsAfterCompactionPartTbl.
/**
* After each major compaction, stats need to be updated on each column of the
* table/partition which previously had stats.
* 1. create a bucketed ORC backed table (Orc is currently required by ACID)
* 2. populate 2 partitions with data
* 3. compute stats
* 4. insert some data into the table using StreamingAPI
* 5. Trigger major compaction (which should update stats)
* 6. check that stats have been updated
* @throws Exception
* todo:
* 2. add non-partitioned test
* 4. add a test with sorted table?
*/
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
//as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
String tblName = "compaction_test";
String tblNameStg = tblName + "_stg";
List<String> colNames = Arrays.asList("a", "b");
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("drop table if exists " + tblNameStg, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(bkt INT)" + //currently ACID requires table to be bucketed
" CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("CREATE EXTERNAL TABLE " + tblNameStg + "(a INT, b STRING)" + " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n'" + " STORED AS TEXTFILE" + " LOCATION '" + stagingFolder.newFolder().toURI().getPath() + "'", driver);
executeStatementOnDriver("load data local inpath '" + BASIC_FILE_NAME + "' overwrite into table " + tblNameStg, driver);
execSelectAndDumpData("select * from " + tblNameStg, driver, "Dumping data for " + tblNameStg + " after load:");
executeStatementOnDriver("FROM " + tblNameStg + " INSERT INTO TABLE " + tblName + " PARTITION(bkt=0) " + "SELECT a, b where a < 2", driver);
executeStatementOnDriver("FROM " + tblNameStg + " INSERT INTO TABLE " + tblName + " PARTITION(bkt=1) " + "SELECT a, b where a >= 2", driver);
execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
CompactionInfo ci = new CompactionInfo("default", tblName, "bkt=0", CompactionType.MAJOR);
LOG.debug("List of stats columns before analyze Part1: " + txnHandler.findColumnsWithStats(ci));
Worker.StatsUpdater su = Worker.StatsUpdater.init(ci, colNames, conf, System.getProperty("user.name"));
//compute stats before compaction
su.gatherStats();
LOG.debug("List of stats columns after analyze Part1: " + txnHandler.findColumnsWithStats(ci));
CompactionInfo ciPart2 = new CompactionInfo("default", tblName, "bkt=1", CompactionType.MAJOR);
LOG.debug("List of stats columns before analyze Part2: " + txnHandler.findColumnsWithStats(ci));
su = Worker.StatsUpdater.init(ciPart2, colNames, conf, System.getProperty("user.name"));
//compute stats before compaction
su.gatherStats();
LOG.debug("List of stats columns after analyze Part2: " + txnHandler.findColumnsWithStats(ci));
//now make sure we get the stats we expect for partition we are going to add data to later
Map<String, List<ColumnStatisticsObj>> stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames);
List<ColumnStatisticsObj> colStats = stats.get(ci.partName);
Assert.assertNotNull("No stats found for partition " + ci.partName, colStats);
Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName());
Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName());
LongColumnStatsData colAStats = colStats.get(0).getStatsData().getLongStats();
Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
Assert.assertEquals("highValue a", 1, colAStats.getHighValue());
Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
Assert.assertEquals("numNdv a", 1, colAStats.getNumDVs());
StringColumnStatsData colBStats = colStats.get(1).getStatsData().getStringStats();
Assert.assertEquals("maxColLen b", 3, colBStats.getMaxColLen());
Assert.assertEquals("avgColLen b", 3.0, colBStats.getAvgColLen(), 0.01);
Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
Assert.assertEquals("nunDVs", 2, colBStats.getNumDVs());
//now save stats for partition we won't modify
stats = msClient.getPartitionColumnStatistics(ciPart2.dbname, ciPart2.tableName, Arrays.asList(ciPart2.partName), colNames);
colStats = stats.get(ciPart2.partName);
LongColumnStatsData colAStatsPart2 = colStats.get(0).getStatsData().getLongStats();
StringColumnStatsData colBStatsPart2 = colStats.get(1).getStatsData().getStringStats();
HiveEndPoint endPt = new HiveEndPoint(null, ci.dbname, ci.tableName, Arrays.asList("0"));
DelimitedInputWriter writer = new DelimitedInputWriter(new String[] { "a", "b" }, ",", endPt);
/* The next call will eventually end up in HiveEndPoint.createPartitionIfNotExists(), which
 * runs an operation on the Driver and starts its own CliSessionState and then closes it,
 * which removes it from ThreadLocal; thus the session created in this class would be gone
 * after this. This was fixed in HiveEndPoint. */
StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName());
TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer);
txnBatch.beginNextTransaction();
Assert.assertEquals(TransactionBatch.TxnState.OPEN, txnBatch.getCurrentTransactionState());
txnBatch.write("50,Kiev".getBytes());
txnBatch.write("51,St. Petersburg".getBytes());
txnBatch.write("44,Boston".getBytes());
txnBatch.commit();
txnBatch.beginNextTransaction();
txnBatch.write("52,Tel Aviv".getBytes());
txnBatch.write("53,Atlantis".getBytes());
txnBatch.write("53,Boston".getBytes());
txnBatch.commit();
txnBatch.close();
connection.close();
execSelectAndDumpData("select * from " + ci.getFullTableName(), driver, ci.getFullTableName());
//so now we have written some new data to bkt=0 and it shows up
CompactionRequest rqst = new CompactionRequest(ci.dbname, ci.tableName, CompactionType.MAJOR);
rqst.setPartitionname(ci.partName);
txnHandler.compact(rqst);
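// Run a compaction worker inline: stop is pre-set to true, so run() performs exactly one pass.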
Worker t = new Worker();
t.setThreadId((int) t.getId());
t.setHiveConf(conf);
AtomicBoolean stop = new AtomicBoolean();
AtomicBoolean looped = new AtomicBoolean();
stop.set(true);
t.init(stop, looped);
t.run();
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
if (1 != compacts.size()) {
Assert.fail("Expecting 1 file and found " + compacts.size() + " files " + compacts.toString());
}
Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames);
colStats = stats.get(ci.partName);
Assert.assertNotNull("No stats found for partition " + ci.partName, colStats);
Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName());
Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName());
colAStats = colStats.get(0).getStatsData().getLongStats();
Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
Assert.assertEquals("highValue a", 53, colAStats.getHighValue());
Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
Assert.assertEquals("numNdv a", 6, colAStats.getNumDVs());
colBStats = colStats.get(1).getStatsData().getStringStats();
Assert.assertEquals("maxColLen b", 14, colBStats.getMaxColLen());
//cast to long to get rid of the repeating decimal
Assert.assertEquals("avgColLen b", (long) 6.1111111111, (long) colBStats.getAvgColLen());
Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
Assert.assertEquals("nunDVs", 10, colBStats.getNumDVs());
//now check that stats for partition we didn't modify did not change
stats = msClient.getPartitionColumnStatistics(ciPart2.dbname, ciPart2.tableName, Arrays.asList(ciPart2.partName), colNames);
colStats = stats.get(ciPart2.partName);
Assert.assertEquals("Expected stats for " + ciPart2.partName + " to stay the same", colAStatsPart2, colStats.get(0).getStatsData().getLongStats());
Assert.assertEquals("Expected stats for " + ciPart2.partName + " to stay the same", colBStatsPart2, colStats.get(1).getStatsData().getStringStats());
}
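For orientation, StatsUpdater.gatherStats() builds and runs an ANALYZE statement through the Driver; the two gatherStats() calls above are roughly equivalent to issuing the statements below by hand. This is a hedged sketch of internals not shown on this page; quoting and column-list details may differ by Hive version.
// Rough manual equivalent of the two su.gatherStats() calls above (sketch only).
executeStatementOnDriver("analyze table default." + tblName
    + " partition(bkt=0) compute statistics for columns a,b", driver);
executeStatementOnDriver("analyze table default." + tblName
    + " partition(bkt=1) compute statistics for columns a,b", driver);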
Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.
The class Cleaner, method run.
@Override
public void run() {
if (cleanerCheckInterval == 0) {
cleanerCheckInterval = conf.getTimeVar(HiveConf.ConfVars.HIVE_COMPACTOR_CLEANER_RUN_INTERVAL, TimeUnit.MILLISECONDS);
}
do {
// This is solely for testing. It checks if the test has set the looped value to false,
// and if so remembers that and then sets it to true at the end. We have to check here
// first to make sure we go through a complete iteration of the loop before resetting it.
boolean setLooped = !looped.get();
TxnStore.MutexAPI.LockHandle handle = null;
long startedAt = -1;
// Make sure nothing escapes this iteration and kills the metastore at large, so wrap it in a big catch Throwable statement.
try {
handle = txnHandler.getMutexAPI().acquireLock(TxnStore.MUTEX_KEY.Cleaner.name());
startedAt = System.currentTimeMillis();
// First look for all the compactions that are waiting to be cleaned. If we have not
// seen an entry before, look for all the locks held on that table or partition and
// record them. We will then only clean the partition once all of those locks have been
// released. This way we avoid removing the files while they are in use,
// while at the same time avoiding starving the cleaner as new readers come along.
// This works because we know that any reader who comes along after the worker thread has
// done the compaction will read the more up to date version of the data (either in a
// newer delta or in a newer base).
List<CompactionInfo> toClean = txnHandler.findReadyToClean();
{
/**
* Since there may be more than 1 instance of Cleaner running, we may have state info
* for items which were already cleaned by other instances. Here we remove them.
*
* In the long run if we add end_time to compaction_queue, then we can check that
* hive_locks.acquired_at > compaction_queue.end_time + safety_buffer in which case
* we know the lock owner is reading files created by this compaction or later.
* The advantage is that we don't have to store the locks.
*/
Set<Long> currentToCleanSet = new HashSet<>();
for (CompactionInfo ci : toClean) {
currentToCleanSet.add(ci.id);
}
Set<Long> cleanPerformedByOthers = new HashSet<>();
for (long id : compactId2CompactInfoMap.keySet()) {
if (!currentToCleanSet.contains(id)) {
cleanPerformedByOthers.add(id);
}
}
for (long id : cleanPerformedByOthers) {
compactId2CompactInfoMap.remove(id);
compactId2LockMap.remove(id);
}
}
if (toClean.size() > 0 || compactId2LockMap.size() > 0) {
ShowLocksResponse locksResponse = txnHandler.showLocks(new ShowLocksRequest());
for (CompactionInfo ci : toClean) {
// add it to our queue.
if (!compactId2LockMap.containsKey(ci.id)) {
compactId2LockMap.put(ci.id, findRelatedLocks(ci, locksResponse));
compactId2CompactInfoMap.put(ci.id, ci);
}
}
// Now, for each entry in the queue, see if all of the associated locks are clear so we
// can clean
Set<Long> currentLocks = buildCurrentLockSet(locksResponse);
List<Long> expiredLocks = new ArrayList<Long>();
List<Long> compactionsCleaned = new ArrayList<Long>();
try {
for (Map.Entry<Long, Set<Long>> queueEntry : compactId2LockMap.entrySet()) {
boolean sawLock = false;
for (Long lockId : queueEntry.getValue()) {
if (currentLocks.contains(lockId)) {
sawLock = true;
break;
} else {
expiredLocks.add(lockId);
}
}
if (!sawLock) {
// Remember to remove this when we're out of the loop,
// we can't do it in the loop or we'll get a concurrent modification exception.
compactionsCleaned.add(queueEntry.getKey());
//Future thought: this may be expensive so consider having a thread pool run in parallel
clean(compactId2CompactInfoMap.get(queueEntry.getKey()));
} else {
// Remove the locks we didn't see so we don't look for them again next time
for (Long lockId : expiredLocks) {
queueEntry.getValue().remove(lockId);
}
}
}
} finally {
if (compactionsCleaned.size() > 0) {
for (Long compactId : compactionsCleaned) {
compactId2LockMap.remove(compactId);
compactId2CompactInfoMap.remove(compactId);
}
}
}
}
} catch (Throwable t) {
LOG.error("Caught an exception in the main loop of compactor cleaner, " + StringUtils.stringifyException(t));
} finally {
if (handle != null) {
handle.releaseLocks();
}
}
if (setLooped) {
looped.set(true);
}
// Now, go back to bed until it's time to do this again
long elapsedTime = System.currentTimeMillis() - startedAt;
if (elapsedTime >= cleanerCheckInterval || stop.get()) {
continue;
} else {
try {
Thread.sleep(cleanerCheckInterval - elapsedTime);
} catch (InterruptedException ie) {
// What can I do about it?
}
}
} while (!stop.get());
}
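findRelatedLocks and buildCurrentLockSet are private Cleaner helpers that are not shown on this page. The following is a minimal sketch of what they plausibly do, inferred only from how the loop above uses them; the real implementations may differ, for example in case-sensitivity of name comparisons or in treating a table-level lock as related to a partition-level compaction.
// Sketch only: ids of the locks that touch the same db/table/partition as this compaction.
// Exact match on db/table/partition; the real version may be more permissive.
private Set<Long> findRelatedLocks(CompactionInfo ci, ShowLocksResponse locksResponse) {
  Set<Long> relatedLocks = new HashSet<>();
  for (ShowLocksResponseElement lock : locksResponse.getLocks()) {
    if (ci.dbname.equals(lock.getDbname())
        && (ci.tableName == null ? lock.getTablename() == null : ci.tableName.equals(lock.getTablename()))
        && (ci.partName == null ? lock.getPartname() == null : ci.partName.equals(lock.getPartname()))) {
      relatedLocks.add(lock.getLockid());
    }
  }
  return relatedLocks;
}

// Sketch only: the ids of all locks currently held, used to detect which remembered locks have expired.
private Set<Long> buildCurrentLockSet(ShowLocksResponse locksResponse) {
  Set<Long> currentLocks = new HashSet<>(locksResponse.getLocks().size());
  for (ShowLocksResponseElement lock : locksResponse.getLocks()) {
    currentLocks.add(lock.getLockid());
  }
  return currentLocks;
}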
Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.
The class Initiator, method run.
@Override
public void run() {
// Make sure nothing escapes this run method and kills the metastore at large, so wrap it in a big catch Throwable statement.
try {
recoverFailedCompactions(false);
int abortedThreshold = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_COMPACTOR_ABORTEDTXN_THRESHOLD);
// Make sure we run through the loop once before checking the stop flag, as this makes testing much easier.
// The stop value is only used in tests and not when the initiator is started from HiveMetaStore.
do {
long startedAt = -1;
TxnStore.MutexAPI.LockHandle handle = null;
// Wrap the inner parts of the loop in a catch Throwable so that errors in one iteration don't doom the entire thread.
try {
handle = txnHandler.getMutexAPI().acquireLock(TxnStore.MUTEX_KEY.Initiator.name());
startedAt = System.currentTimeMillis();
//todo: add method to only get current i.e. skip history - more efficient
ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest());
ValidTxnList txns = TxnUtils.createValidCompactTxnList(txnHandler.getOpenTxnsInfo());
Set<CompactionInfo> potentials = txnHandler.findPotentialCompactions(abortedThreshold);
LOG.debug("Found " + potentials.size() + " potential compactions, " + "checking to see if we should compact any of them");
for (CompactionInfo ci : potentials) {
LOG.info("Checking to see if we should compact " + ci.getFullPartitionName());
try {
Table t = resolveTable(ci);
if (t == null) {
// Most likely this means it's a temp table
LOG.info("Can't find table " + ci.getFullTableName() + ", assuming it's a temp " + "table or has been dropped and moving on.");
continue;
}
// Check whether automatic compaction has been turned off for this table.
if (noAutoCompactSet(t)) {
LOG.info("Table " + tableName(t) + " marked " + hive_metastoreConstants.TABLE_NO_AUTO_COMPACT + "=true so we will not compact it.");
continue;
}
// If the table is partitioned but no partition is named in the request, then it's a dynamic partitioning case and we shouldn't check the table itself.
if (t.getPartitionKeys() != null && t.getPartitionKeys().size() > 0 && ci.partName == null) {
LOG.debug("Skipping entry for " + ci.getFullTableName() + " as it is from dynamic" + " partitioning");
continue;
}
// Note that this check is racy: another compaction may have been initiated between the time currentCompactions was generated and now.
if (lookForCurrentCompactions(currentCompactions, ci)) {
LOG.debug("Found currently initiated or working compaction for " + ci.getFullPartitionName() + " so we will not initiate another compaction");
continue;
}
if (txnHandler.checkFailedCompactions(ci)) {
LOG.warn("Will not initiate compaction for " + ci.getFullPartitionName() + " since last " + HiveConf.ConfVars.COMPACTOR_INITIATOR_FAILED_THRESHOLD + " attempts to compact it failed.");
txnHandler.markFailed(ci);
continue;
}
// Figure out who we should run the file operations as
Partition p = resolvePartition(ci);
if (p == null && ci.partName != null) {
LOG.info("Can't find partition " + ci.getFullPartitionName() + ", assuming it has been dropped and moving on.");
continue;
}
StorageDescriptor sd = resolveStorageDescriptor(t, p);
String runAs = findUserToRunAs(sd.getLocation(), t);
/* Future thought: checkForCompaction will check a lot of file metadata and may be expensive.
 * Long term we should consider having a thread pool here and running the checkForCompaction
 * calls in parallel. */
CompactionType compactionNeeded = checkForCompaction(ci, txns, sd, t.getParameters(), runAs);
if (compactionNeeded != null)
requestCompaction(ci, runAs, compactionNeeded);
} catch (Throwable t) {
LOG.error("Caught exception while trying to determine if we should compact " + ci + ". Marking failed to avoid repeated failures, " + "" + StringUtils.stringifyException(t));
txnHandler.markFailed(ci);
}
}
// Check for timed out remote workers.
recoverFailedCompactions(true);
// Clean anything from the txns table that has no components left in txn_components.
txnHandler.cleanEmptyAbortedTxns();
} catch (Throwable t) {
LOG.error("Initiator loop caught unexpected exception this time through the loop: " + StringUtils.stringifyException(t));
} finally {
if (handle != null) {
handle.releaseLocks();
}
}
long elapsedTime = System.currentTimeMillis() - startedAt;
if (elapsedTime >= checkInterval || stop.get())
continue;
else
Thread.sleep(checkInterval - elapsedTime);
} while (!stop.get());
} catch (Throwable t) {
LOG.error("Caught an exception in the main loop of compactor initiator, exiting " + StringUtils.stringifyException(t));
}
}
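noAutoCompactSet is a small private helper that is not shown on this page; here is a plausible sketch, assuming it just inspects the table property referenced in the log message above (the real version may also accept an upper-cased key for backward compatibility).
// Sketch only: true when the table opted out of automatic compaction via
// the "no_auto_compaction"="true" table property.
private boolean noAutoCompactSet(Table t) {
  String noAutoCompact = t.getParameters().get(hive_metastoreConstants.TABLE_NO_AUTO_COMPACT);
  return noAutoCompact != null && noAutoCompact.equalsIgnoreCase("true");
}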
Use of org.apache.hadoop.hive.metastore.txn.CompactionInfo in project hive by apache.
The class Worker, method run.
//todo: this doesn't check whether a compaction is already running (the Initiator does check,
// but we don't go through the Initiator for user-initiated compactions)
@Override
public void run() {
do {
boolean launchedJob = false;
// Make sure nothing escapes this run method and kills the metastore at large, so wrap it in a big catch Throwable statement.
try {
final CompactionInfo ci = txnHandler.findNextToCompact(name);
if (ci == null && !stop.get()) {
try {
Thread.sleep(SLEEP_TIME);
continue;
} catch (InterruptedException e) {
LOG.warn("Worker thread sleep interrupted " + e.getMessage());
continue;
}
}
// Find the table we will be working with.
Table t1 = null;
try {
t1 = resolveTable(ci);
if (t1 == null) {
LOG.info("Unable to find table " + ci.getFullTableName() + ", assuming it was dropped and moving on.");
txnHandler.markCleaned(ci);
continue;
}
} catch (MetaException e) {
txnHandler.markCleaned(ci);
continue;
}
// This chicanery is to get around the fact that the table needs to be final in order to
// go into the doAs below.
final Table t = t1;
// Find the partition we will be working with, if there is one.
Partition p = null;
try {
p = resolvePartition(ci);
if (p == null && ci.partName != null) {
LOG.info("Unable to find partition " + ci.getFullPartitionName() + ", assuming it was dropped and moving on.");
txnHandler.markCleaned(ci);
continue;
}
} catch (Exception e) {
txnHandler.markCleaned(ci);
continue;
}
// Find the appropriate storage descriptor
final StorageDescriptor sd = resolveStorageDescriptor(t, p);
// Check that the table or partition isn't sorted, as we don't yet support that.
if (sd.getSortCols() != null && !sd.getSortCols().isEmpty()) {
LOG.error("Attempt to compact sorted table, which is not yet supported!");
txnHandler.markCleaned(ci);
continue;
}
final boolean isMajor = ci.isMajorCompaction();
final ValidTxnList txns = TxnUtils.createValidCompactTxnList(txnHandler.getOpenTxnsInfo());
LOG.debug("ValidCompactTxnList: " + txns.writeToString());
txnHandler.setCompactionHighestTxnId(ci, txns.getHighWatermark());
final StringBuilder jobName = new StringBuilder(name);
jobName.append("-compactor-");
jobName.append(ci.getFullPartitionName());
// Determine who to run as
String runAs;
if (ci.runAs == null) {
runAs = findUserToRunAs(sd.getLocation(), t);
txnHandler.setRunAs(ci.id, runAs);
} else {
runAs = ci.runAs;
}
LOG.info("Starting " + ci.type.toString() + " compaction for " + ci.getFullPartitionName());
final StatsUpdater su = StatsUpdater.init(ci, txnHandler.findColumnsWithStats(ci), conf, runJobAsSelf(runAs) ? runAs : t.getOwner());
final CompactorMR mr = new CompactorMR();
launchedJob = true;
try {
if (runJobAsSelf(runAs)) {
mr.run(conf, jobName.toString(), t, sd, txns, ci, su, txnHandler);
} else {
UserGroupInformation ugi = UserGroupInformation.createProxyUser(t.getOwner(), UserGroupInformation.getLoginUser());
ugi.doAs(new PrivilegedExceptionAction<Object>() {
@Override
public Object run() throws Exception {
mr.run(conf, jobName.toString(), t, sd, txns, ci, su, txnHandler);
return null;
}
});
try {
FileSystem.closeAllForUGI(ugi);
} catch (IOException exception) {
LOG.error("Could not clean up file-system handles for UGI: " + ugi + " for " + ci.getFullPartitionName(), exception);
}
}
txnHandler.markCompacted(ci);
if (conf.getBoolVar(HiveConf.ConfVars.HIVE_IN_TEST)) {
mrJob = mr.getMrJob();
}
} catch (Exception e) {
LOG.error("Caught exception while trying to compact " + ci + ". Marking failed to avoid repeated failures, " + StringUtils.stringifyException(e));
txnHandler.markFailed(ci);
}
} catch (Throwable t) {
LOG.error("Caught an exception in the main loop of compactor worker " + name + ", " + StringUtils.stringifyException(t));
}
// If we didn't launch a job this time through the loop, sleep a bit before we restart the loop.
if (!launchedJob && !stop.get()) {
try {
Thread.sleep(SLEEP_TIME);
} catch (InterruptedException e) {
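// Ignore the interruption; the loop condition below re-checks the stop flag.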
}
}
} while (!stop.get());
}
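runJobAsSelf comes from the compactor thread base class and is not shown here; a plausible sketch follows, assuming it simply compares the resolved owner against the user this process runs as (hypothetical; the real check may differ).
// Sketch only: run the job directly when the owner matches the process user;
// otherwise Worker.run() above falls back to the UserGroupInformation.doAs() path.
protected boolean runJobAsSelf(String runAs) {
  return runAs.equals(System.getProperty("user.name"));
}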