use of org.apache.hadoop.hive.metastore.api.ShowCompactResponse in project hive by apache.
the class TestTxnCommands2 method countCompacts.
/**
 * Tallies the compactions returned by {@code txnHandler.showCompact} according to their state.
 *
 * @param txnHandler store that is asked for the current list of compactions
 * @return counters holding the total number of compactions plus one count per recognised state
 * @throws MetaException declared by the signature; presumably raised by {@code showCompact} - confirm
 * @throws IllegalStateException if an entry reports a state that equals none of the
 *         {@code TxnStore.*_RESPONSE} constants checked below
 */
private static CompactionsByState countCompacts(TxnStore txnHandler) throws MetaException {
  ShowCompactResponse response = txnHandler.showCompact(new ShowCompactRequest());
  CompactionsByState counts = new CompactionsByState();
  counts.total = response.getCompactsSize();
  for (ShowCompactResponseElement entry : response.getCompacts()) {
    // Constant-first equals() keeps a null state from throwing NPE; it falls through to the
    // IllegalStateException at the bottom instead.
    String state = entry.getState();
    if (TxnStore.FAILED_RESPONSE.equals(state)) {
      counts.failed++;
      continue;
    }
    if (TxnStore.CLEANING_RESPONSE.equals(state)) {
      counts.readyToClean++;
      continue;
    }
    if (TxnStore.INITIATED_RESPONSE.equals(state)) {
      counts.initiated++;
      continue;
    }
    if (TxnStore.SUCCEEDED_RESPONSE.equals(state)) {
      counts.succeeded++;
      continue;
    }
    if (TxnStore.WORKING_RESPONSE.equals(state)) {
      counts.working++;
      continue;
    }
    if (TxnStore.ATTEMPTED_RESPONSE.equals(state)) {
      counts.attempted++;
      continue;
    }
    throw new IllegalStateException("Unexpected state: " + state);
  }
  return counts;
}
use of org.apache.hadoop.hive.metastore.api.ShowCompactResponse in project hive by apache.
the class TestTxnHandler method testCompactMajorWithPartition.
/**
 * Requests a major compaction of partition {@code ds=today} of table {@code foo.bar} and checks
 * that {@code showCompact} reports exactly one entry carrying the requested database, table,
 * partition and type, in state "initiated" with a start value of 0.
 */
@Test
public void testCompactMajorWithPartition() throws Exception {
  // Queue the compaction request.
  CompactionRequest request = new CompactionRequest("foo", "bar", CompactionType.MAJOR);
  request.setPartitionname("ds=today");
  txnHandler.compact(request);

  // Exactly one compaction must be listed.
  List<ShowCompactResponseElement> queued =
      txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
  assertEquals(1, queued.size());

  // The listed entry must echo the request and not have been started yet.
  ShowCompactResponseElement entry = queued.get(0);
  assertEquals("foo", entry.getDbname());
  assertEquals("bar", entry.getTablename());
  assertEquals("ds=today", entry.getPartitionname());
  assertEquals(CompactionType.MAJOR, entry.getType());
  assertEquals("initiated", entry.getState());
  assertEquals(0L, entry.getStart());
}
use of org.apache.hadoop.hive.metastore.api.ShowCompactResponse in project hive by apache.
the class TestCompactor method testStatsAfterCompactionPartTbl.
/**
 * After each major compaction, stats need to be updated on each column of the
 * table/partition which previously had stats.
 * <ol>
 * <li>create a bucketed ORC backed table (Orc is currently required by ACID)</li>
 * <li>populate 2 partitions with data</li>
 * <li>compute stats</li>
 * <li>insert some data into the table using StreamingAPI</li>
 * <li>trigger major compaction (which should update stats)</li>
 * <li>check that stats have been updated</li>
 * </ol>
 * TODO: add a non-partitioned test; add a test with a sorted table?
 *
 * @throws Exception propagated from any driver, metastore, streaming or compactor call made below
 */
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
  // as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
  String tblName = "compaction_test";
  String tblNameStg = tblName + "_stg";
  List<String> colNames = Arrays.asList("a", "b");
  executeStatementOnDriver("drop table if exists " + tblName, driver);
  executeStatementOnDriver("drop table if exists " + tblNameStg, driver);
  // currently ACID requires table to be bucketed
  executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(bkt INT)"
      + " CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
  executeStatementOnDriver("CREATE EXTERNAL TABLE " + tblNameStg + "(a INT, b STRING)" + " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n'" + " STORED AS TEXTFILE" + " LOCATION '" + stagingFolder.newFolder().toURI().getPath() + "'", driver);
  executeStatementOnDriver("load data local inpath '" + BASIC_FILE_NAME + "' overwrite into table " + tblNameStg, driver);
  execSelectAndDumpData("select * from " + tblNameStg, driver, "Dumping data for " + tblNameStg + " after load:");
  // split the staged rows across the two partitions
  executeStatementOnDriver("FROM " + tblNameStg + " INSERT INTO TABLE " + tblName + " PARTITION(bkt=0) " + "SELECT a, b where a < 2", driver);
  executeStatementOnDriver("FROM " + tblNameStg + " INSERT INTO TABLE " + tblName + " PARTITION(bkt=1) " + "SELECT a, b where a >= 2", driver);
  execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
  TxnStore txnHandler = TxnUtils.getTxnStore(conf);
  CompactionInfo ci = new CompactionInfo("default", tblName, "bkt=0", CompactionType.MAJOR);
  LOG.debug("List of stats columns before analyze Part1: " + txnHandler.findColumnsWithStats(ci));
  Worker.StatsUpdater su = Worker.StatsUpdater.init(ci, colNames, conf, System.getProperty("user.name"));
  // compute stats before compaction
  su.gatherStats();
  LOG.debug("List of stats columns after analyze Part1: " + txnHandler.findColumnsWithStats(ci));
  CompactionInfo ciPart2 = new CompactionInfo("default", tblName, "bkt=1", CompactionType.MAJOR);
  // log the columns of the second partition (ciPart2), not the first one
  LOG.debug("List of stats columns before analyze Part2: " + txnHandler.findColumnsWithStats(ciPart2));
  su = Worker.StatsUpdater.init(ciPart2, colNames, conf, System.getProperty("user.name"));
  // compute stats before compaction
  su.gatherStats();
  LOG.debug("List of stats columns after analyze Part2: " + txnHandler.findColumnsWithStats(ciPart2));
  // now make sure we get the stats we expect for partition we are going to add data to later
  Map<String, List<ColumnStatisticsObj>> stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames);
  List<ColumnStatisticsObj> colStats = stats.get(ci.partName);
  Assert.assertNotNull("No stats found for partition " + ci.partName, colStats);
  Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName());
  Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName());
  LongColumnStatsData colAStats = colStats.get(0).getStatsData().getLongStats();
  Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
  Assert.assertEquals("highValue a", 1, colAStats.getHighValue());
  Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
  Assert.assertEquals("numNdv a", 1, colAStats.getNumDVs());
  StringColumnStatsData colBStats = colStats.get(1).getStatsData().getStringStats();
  Assert.assertEquals("maxColLen b", 3, colBStats.getMaxColLen());
  Assert.assertEquals("avgColLen b", 3.0, colBStats.getAvgColLen(), 0.01);
  Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
  Assert.assertEquals("nunDVs", 3, colBStats.getNumDVs());
  // now save stats for partition we won't modify
  stats = msClient.getPartitionColumnStatistics(ciPart2.dbname, ciPart2.tableName, Arrays.asList(ciPart2.partName), colNames);
  colStats = stats.get(ciPart2.partName);
  LongColumnStatsData colAStatsPart2 = colStats.get(0).getStatsData().getLongStats();
  StringColumnStatsData colBStatsPart2 = colStats.get(1).getStatsData().getStringStats();
  HiveEndPoint endPt = new HiveEndPoint(null, ci.dbname, ci.tableName, Arrays.asList("0"));
  DelimitedInputWriter writer = new DelimitedInputWriter(new String[] { "a", "b" }, ",", endPt);
  /*
   * Original author's note: the next call eventually ends up in
   * HiveEndPoint.createPartitionIfNotExists(), which performs an operation on Driver. That
   * starts its own CliSessionState and then closes it, which removes it from the ThreadLocal,
   * so the session created in this class would be gone afterwards. This was fixed in
   * HiveEndPoint.
   */
  StreamingConnection connection = endPt.newConnection(true, "UT_" + Thread.currentThread().getName());
  TransactionBatch txnBatch = connection.fetchTransactionBatch(2, writer);
  // first streaming transaction: three rows
  txnBatch.beginNextTransaction();
  Assert.assertEquals(TransactionBatch.TxnState.OPEN, txnBatch.getCurrentTransactionState());
  txnBatch.write("50,Kiev".getBytes());
  txnBatch.write("51,St. Petersburg".getBytes());
  txnBatch.write("44,Boston".getBytes());
  txnBatch.commit();
  // second streaming transaction: three more rows
  txnBatch.beginNextTransaction();
  txnBatch.write("52,Tel Aviv".getBytes());
  txnBatch.write("53,Atlantis".getBytes());
  txnBatch.write("53,Boston".getBytes());
  txnBatch.commit();
  txnBatch.close();
  connection.close();
  execSelectAndDumpData("select * from " + ci.getFullTableName(), driver, ci.getFullTableName());
  // so now we have written some new data to bkt=0 and it shows up
  CompactionRequest rqst = new CompactionRequest(ci.dbname, ci.tableName, CompactionType.MAJOR);
  rqst.setPartitionname(ci.partName);
  txnHandler.compact(rqst);
  // run the Worker synchronously on this thread; the stop flag is already set before init()
  Worker t = new Worker();
  t.setThreadId((int) t.getId());
  t.setConf(conf);
  AtomicBoolean stop = new AtomicBoolean();
  AtomicBoolean looped = new AtomicBoolean();
  stop.set(true);
  t.init(stop, looped);
  t.run();
  ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
  List<ShowCompactResponseElement> compacts = rsp.getCompacts();
  if (1 != compacts.size()) {
    Assert.fail("Expecting 1 file and found " + compacts.size() + " files " + compacts.toString());
  }
  Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
  // stats of the compacted partition must now reflect the streamed rows
  stats = msClient.getPartitionColumnStatistics(ci.dbname, ci.tableName, Arrays.asList(ci.partName), colNames);
  colStats = stats.get(ci.partName);
  Assert.assertNotNull("No stats found for partition " + ci.partName, colStats);
  Assert.assertEquals("Expected column 'a' at index 0", "a", colStats.get(0).getColName());
  Assert.assertEquals("Expected column 'b' at index 1", "b", colStats.get(1).getColName());
  colAStats = colStats.get(0).getStatsData().getLongStats();
  Assert.assertEquals("lowValue a", 1, colAStats.getLowValue());
  Assert.assertEquals("highValue a", 53, colAStats.getHighValue());
  Assert.assertEquals("numNulls a", 0, colAStats.getNumNulls());
  Assert.assertEquals("numNdv a", 6, colAStats.getNumDVs());
  colBStats = colStats.get(1).getStatsData().getStringStats();
  Assert.assertEquals("maxColLen b", 14, colBStats.getMaxColLen());
  // cast it to long to get rid of periodic decimal
  Assert.assertEquals("avgColLen b", (long) 6.1111111111, (long) colBStats.getAvgColLen());
  Assert.assertEquals("numNulls b", 0, colBStats.getNumNulls());
  Assert.assertEquals("nunDVs", 8, colBStats.getNumDVs());
  // now check that stats for partition we didn't modify did not change
  stats = msClient.getPartitionColumnStatistics(ciPart2.dbname, ciPart2.tableName, Arrays.asList(ciPart2.partName), colNames);
  colStats = stats.get(ciPart2.partName);
  Assert.assertEquals("Expected stats for " + ciPart2.partName + " to stay the same", colAStatsPart2, colStats.get(0).getStatsData().getLongStats());
  Assert.assertEquals("Expected stats for " + ciPart2.partName + " to stay the same", colBStatsPart2, colStats.get(1).getStatsData().getStringStats());
}
use of org.apache.hadoop.hive.metastore.api.ShowCompactResponse in project hive by apache.
the class TestCompactor method testTableProperties.
/**
 * Users have the choice of specifying compaction related tblproperties either in the CREATE TABLE
 * statement or in the ALTER TABLE .. COMPACT statement. This test exercises both cases.
 */
@Test
public void testTableProperties() throws Exception {
  // plain acid table
  String tblName1 = "ttp1";
  // acid table with customized tblproperties
  String tblName2 = "ttp2";
  executeStatementOnDriver("drop table if exists " + tblName1, driver);
  executeStatementOnDriver("drop table if exists " + tblName2, driver);
  executeStatementOnDriver("CREATE TABLE " + tblName1 + "(a INT, b STRING) "
      + " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC"
      + " TBLPROPERTIES ('transactional'='true', 'orc.compress.size'='2700')", driver);
  // ttp2 overrides: 2048 MB memory for the compaction map job, minor compaction if more than
  // 4 delta dirs, major compaction if more than 49%
  executeStatementOnDriver("CREATE TABLE " + tblName2 + "(a INT, b STRING) "
      + " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ("
      + "'transactional'='true',"
      + "'compactor.mapreduce.map.memory.mb'='2048',"
      + "'compactorthreshold.hive.compactor.delta.num.threshold'='4',"
      + "'compactorthreshold.hive.compactor.delta.pct.threshold'='0.49'"
      + ")", driver);

  // Insert 5 rows to both tables: all five into ttp1 first, then all five into ttp2
  String[] initialRows = { "(1, 'a')", "(2, 'b')", "(3, 'c')", "(4, 'd')", "(5, 'e')" };
  for (String table : new String[] { tblName1, tblName2 }) {
    for (String row : initialRows) {
      executeStatementOnDriver("insert into " + table + " values " + row, driver);
    }
  }

  runInitiator(conf);
  // Compactor should only schedule compaction for ttp2 (delta.num.threshold=4), not ttp1
  TxnStore txnHandler = TxnUtils.getTxnStore(conf);
  List<ShowCompactResponseElement> compacts =
      txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
  Assert.assertEquals(1, compacts.size());
  Assert.assertEquals(TxnStore.INITIATED_RESPONSE, compacts.get(0).getState());
  Assert.assertEquals("ttp2", compacts.get(0).getTablename());
  // type is MAJOR since there's no base yet
  Assert.assertEquals(CompactionType.MAJOR, compacts.get(0).getType());

  // Manually request a compaction of ttp1 as well, so both tables end up compacted and
  // comparable again
  executeStatementOnDriver("alter table " + tblName1 + " compact 'major'", driver);
  compacts = txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
  Assert.assertEquals(2, compacts.size());
  Assert.assertEquals("ttp2", compacts.get(0).getTablename());
  Assert.assertEquals(TxnStore.INITIATED_RESPONSE, compacts.get(0).getState());
  Assert.assertEquals("ttp1", compacts.get(1).getTablename());
  Assert.assertEquals(TxnStore.INITIATED_RESPONSE, compacts.get(1).getState());

  // compact ttp2, by running the Worker explicitly, in order to get the reference to the
  // compactor MR job
  Worker ttp2Worker = new Worker();
  ttp2Worker.setThreadId((int) ttp2Worker.getId());
  ttp2Worker.setConf(conf);
  ttp2Worker.init(new AtomicBoolean(true), new AtomicBoolean());
  ttp2Worker.run();
  JobConf ttp2Job = ttp2Worker.getMrJob();
  // 2048 comes from tblproperties
  Assert.assertEquals(2048, ttp2Job.getMemoryForMapTask());

  // Compact ttp1
  Worker ttp1Worker = new Worker();
  ttp1Worker.setThreadId((int) ttp1Worker.getId());
  ttp1Worker.setConf(conf);
  ttp1Worker.init(new AtomicBoolean(true), new AtomicBoolean());
  ttp1Worker.run();
  JobConf ttp1Job = ttp1Worker.getMrJob();
  // 1024 is the default value
  Assert.assertEquals(1024, ttp1Job.getMemoryForMapTask());

  // Clean up
  runCleaner(conf);
  compacts = txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
  Assert.assertEquals(2, compacts.size());
  Assert.assertEquals("ttp2", compacts.get(0).getTablename());
  Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, compacts.get(0).getState());
  Assert.assertEquals("ttp1", compacts.get(1).getTablename());
  Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, compacts.get(1).getState());

  /*
   * We just did a major compaction on ttp1. Open any file produced by it and check its
   * compression (buffer) size: it must be the 2700 given in ttp1's tblproperties.
   */
  List<String> fileNames = execSelectAndDumpData(
      "select distinct INPUT__FILE__NAME from " + tblName1, driver, "Find Orc File bufer default");
  Assert.assertTrue("empty rs?", fileNames != null && !fileNames.isEmpty());
  Path ttp1File = new Path(fileNames.get(0));
  Reader ttp1Reader = OrcFile.createReader(ttp1File.getFileSystem(conf), ttp1File);
  Assert.assertEquals("Expected default compression size", 2700, ttp1Reader.getCompressionSize());
  // make sure 2700 is not the default so that we are testing if tblproperties indeed propagate
  Assert.assertNotEquals("Unexpected default compression size", 2700, OrcConf.BUFFER_SIZE.getDefaultValue());

  // Insert one more row - this should trigger hive.compactor.delta.pct.threshold to be reached
  // for ttp2
  executeStatementOnDriver("insert into " + tblName1 + " values (6, 'f')", driver);
  executeStatementOnDriver("insert into " + tblName2 + " values (6, 'f')", driver);
  // Intentionally set the global threshold high so that it will not trigger major compaction
  // for ttp1. Only ttp2 (table-level delta.pct.threshold=0.49) should be scheduled because of
  // the newly inserted row (the original author notes the actual pct as 0.66).
  conf.setFloatVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_PCT_THRESHOLD, 0.8f);
  runInitiator(conf);
  compacts = txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
  Assert.assertEquals(3, compacts.size());
  Assert.assertEquals("ttp2", compacts.get(0).getTablename());
  Assert.assertEquals(TxnStore.INITIATED_RESPONSE, compacts.get(0).getState());

  // Finish the scheduled compaction for ttp2
  runWorker(conf);
  runCleaner(conf);
  compacts = txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
  Assert.assertEquals(3, compacts.size());
  Assert.assertEquals("ttp2", compacts.get(0).getTablename());
  Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, compacts.get(0).getState());

  // Now test tblproperties specified on ALTER TABLE .. COMPACT .. statement
  executeStatementOnDriver("insert into " + tblName2 + " values (7, 'g')", driver);
  executeStatementOnDriver("alter table " + tblName2 + " compact 'major'"
      + " with overwrite tblproperties ("
      + "'compactor.mapreduce.map.memory.mb'='3072',"
      + "'tblprops.orc.compress.size'='3141')", driver);
  compacts = txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
  Assert.assertEquals(4, compacts.size());
  Assert.assertEquals("ttp2", compacts.get(0).getTablename());
  Assert.assertEquals(TxnStore.INITIATED_RESPONSE, compacts.get(0).getState());
  // make sure we are checking the right (latest) compaction entry
  Assert.assertEquals(4, compacts.get(0).getId());

  // Run the Worker explicitly, in order to get the reference to the compactor MR job
  Worker overrideWorker = new Worker();
  overrideWorker.setThreadId((int) overrideWorker.getId());
  overrideWorker.setConf(conf);
  overrideWorker.init(new AtomicBoolean(true), new AtomicBoolean());
  overrideWorker.run();
  JobConf overrideJob = overrideWorker.getMrJob();
  Assert.assertEquals(3072, overrideJob.getMemoryForMapTask());
  // NOTE(review): the "4" before ":3141" is presumably the value length in the serialized
  // property map - confirm against the serializer
  Assert.assertTrue(overrideJob.get("hive.compactor.table.props").contains("orc.compress.size4:3141"));

  // we just ran Major compaction so we should have a base_x in tblName2 that has the new files
  // Get the name of a file and look at its properties to see if orc.compress.size was respected.
  fileNames = execSelectAndDumpData(
      "select distinct INPUT__FILE__NAME from " + tblName2, driver, "Find Compacted Orc File");
  Assert.assertTrue("empty rs?", fileNames != null && !fileNames.isEmpty());
  Path ttp2File = new Path(fileNames.get(0));
  Reader ttp2Reader = OrcFile.createReader(ttp2File.getFileSystem(conf), ttp2File);
  Assert.assertEquals("File written with wrong buffer size", 3141, ttp2Reader.getCompressionSize());
}
use of org.apache.hadoop.hive.metastore.api.ShowCompactResponse in project hive by apache.
the class TestCompactor method dynamicPartitioningDelete.
/**
 * Creates a partitioned, bucketed ACID table, populates two partitions through a
 * dynamic-partition insert, then updates and deletes the row with {@code a = 1}. With the delta
 * threshold set to 2, a single Initiator pass is expected to queue exactly one compaction, for
 * partition {@code ds=today} of the table, in state "initiated".
 */
@Test
public void dynamicPartitioningDelete() throws Exception {
  String tblName = "ddpct";
  executeStatementOnDriver("drop table if exists " + tblName, driver);
  // currently ACID requires table to be bucketed
  executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)"
      + " CLUSTERED BY(a) INTO 2 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
  executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " + "'today'), (2, 'wilma', 'yesterday')", driver);
  executeStatementOnDriver("update " + tblName + " set b = 'fred' where a = 1", driver);
  executeStatementOnDriver("delete from " + tblName + " where b = 'fred'", driver);

  Initiator initiator = new Initiator();
  initiator.setThreadId((int) initiator.getId());
  // Set to 2 so insert and update don't set it off but delete does
  conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 2);
  initiator.setConf(conf);
  // the stop flag is already true when the Initiator is initialised and run on this thread
  AtomicBoolean stop = new AtomicBoolean(true);
  initiator.init(stop, new AtomicBoolean());
  initiator.run();

  TxnStore txnHandler = TxnUtils.getTxnStore(conf);
  ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
  List<ShowCompactResponseElement> compacts = rsp.getCompacts();
  Assert.assertEquals(1, compacts.size());

  // collect partition names in sorted order while checking every queued entry
  SortedSet<String> partNames = new TreeSet<String>();
  for (ShowCompactResponseElement compact : compacts) {
    Assert.assertEquals("default", compact.getDbname());
    Assert.assertEquals(tblName, compact.getTablename());
    Assert.assertEquals("initiated", compact.getState());
    partNames.add(compact.getPartitionname());
  }
  Assert.assertEquals("ds=today", partNames.first());
}
Aggregations