Use of org.apache.hadoop.hive.metastore.txn.TxnStore in project hive by apache.
The class TestCompactor, method dynamicPartitioningDelete.
@Test
public void dynamicPartitioningDelete() throws Exception {
String tblName = "ddpct";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(ds string)" + // currently ACID requires table to be bucketed
" CLUSTERED BY(a) INTO 2 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("insert into " + tblName + " partition (ds) values (1, 'fred', " + "'today'), (2, 'wilma', 'yesterday')", driver);
executeStatementOnDriver("update " + tblName + " set b = 'fred' where a = 1", driver);
executeStatementOnDriver("delete from " + tblName + " where b = 'fred'", driver);
// Set to 2 so insert and update don't set it off but delete does
conf.setIntVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_NUM_THRESHOLD, 2);
runInitiator(conf);
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
Assert.assertEquals(1, compacts.size());
SortedSet<String> partNames = new TreeSet<String>();
verifyCompactions(compacts, partNames, tblName);
List<String> names = new ArrayList<String>(partNames);
Assert.assertEquals("ds=today", names.get(0));
}
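The verifyCompactions helper called above is defined elsewhere in TestCompactor and is not shown here. A minimal, hypothetical sketch of what such a check could look like, assuming ShowCompactResponseElement exposes getTablename() and getPartitionname() and that the method sits in the same test class:

// Hypothetical sketch of a verifyCompactions-style helper: every scheduled compaction
// should belong to the expected table, and its partition name is collected for the caller.
private static void verifyCompactionsSketch(List<ShowCompactResponseElement> compacts, SortedSet<String> partNames, String tblName) {
  for (ShowCompactResponseElement compact : compacts) {
    Assert.assertEquals("Unexpected table in compaction queue", tblName, compact.getTablename());
    partNames.add(compact.getPartitionname());
  }
}

With the data above, only the ds='today' partition receives the update and the delete, so exactly one compaction, for "ds=today", is expected in the queue.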
Use of org.apache.hadoop.hive.metastore.txn.TxnStore in project hive by apache.
The class TestCompactor, method testSkippedCompactionCleanerKeepsAborted.
/**
 * There is a special case handled in the compaction Worker that will skip compaction
 * if there is only one valid delta. But such a compaction will still be cleaned up if there are aborted directories.
 * @see Worker.isEnoughToCompact
 * However, if no compaction was done, deltas containing mixed aborted/committed writes from streaming cannot be cleaned,
 * and the metadata belonging to those aborted transactions cannot be removed.
 * @throws Exception ex
 */
@Test
public void testSkippedCompactionCleanerKeepsAborted() throws Exception {
String dbName = "default";
String tblName = "cws";
String agentInfo = "UT_" + Thread.currentThread().getName();
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(b STRING) " + " PARTITIONED BY (a INT) STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("alter table " + tblName + " add partition(a=1)", driver);
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
// Create initial aborted txn
HiveStreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withStaticPartitionValues(Collections.singletonList("1")).withAgentInfo(agentInfo).withHiveConf(conf).withRecordWriter(writer).withStreamingOptimizations(true).withTransactionBatchSize(1).connect();
connection.beginTransaction();
connection.write("3,1".getBytes());
connection.write("4,1".getBytes());
connection.abortTransaction();
connection.close();
// Create a sequence of commit, abort, commit to the same delta folder
connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withStaticPartitionValues(Collections.singletonList("1")).withAgentInfo(agentInfo).withHiveConf(conf).withRecordWriter(writer).withStreamingOptimizations(true).withTransactionBatchSize(3).connect();
connection.beginTransaction();
connection.write("1,1".getBytes());
connection.write("2,1".getBytes());
connection.commitTransaction();
connection.beginTransaction();
connection.write("3,1".getBytes());
connection.write("4,1".getBytes());
connection.abortTransaction();
connection.beginTransaction();
connection.write("5,1".getBytes());
connection.write("6,1".getBytes());
connection.commitTransaction();
connection.close();
// Check that aborted are not read back
driver.run("select * from cws");
List res = new ArrayList();
driver.getFetchTask().fetch(res);
Assert.assertEquals(4, res.size());
int count = TestTxnDbUtil.countQueryAgent(conf, "select count(*) from TXN_COMPONENTS");
Assert.assertEquals("There should be 2 record for two aborted transaction", 2, count);
// Start a compaction, that will be skipped, because only one valid delta is there
driver.run("alter table cws partition(a='1') compact 'minor'");
runWorker(conf);
// Cleaner should not delete info about aborted txn 2
runCleaner(conf);
txnHandler.cleanEmptyAbortedAndCommittedTxns();
count = TestTxnDbUtil.countQueryAgent(conf, "select count(*) from TXN_COMPONENTS");
Assert.assertEquals("There should be 1 record for the second aborted transaction", 1, count);
driver.run("select * from cws");
res.clear();
driver.getFetchTask().fetch(res);
Assert.assertEquals(4, res.size());
}
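The TXN_COMPONENTS checks above are the crux of the test: metadata for an aborted write that shares a delta with committed writes must survive the Cleaner. A hedged sketch of a reusable assertion for that pattern, assuming it is added to the same test class so that runCleaner, TestTxnDbUtil, and the conf field are available exactly as used above:

// Illustrative helper (not part of the Hive test): run one Cleaner cycle, purge empty
// aborted/committed txns, and assert how many TXN_COMPONENTS rows are left behind.
private void assertTxnComponentsAfterCleaner(TxnStore txnHandler, int expectedRows) throws Exception {
  runCleaner(conf);
  txnHandler.cleanEmptyAbortedAndCommittedTxns();
  int count = TestTxnDbUtil.countQueryAgent(conf, "select count(*) from TXN_COMPONENTS");
  Assert.assertEquals("Unexpected number of TXN_COMPONENTS rows", expectedRows, count);
}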
Use of org.apache.hadoop.hive.metastore.txn.TxnStore in project hive by apache.
The class TestCompactor, method testStatsAfterCompactionPartTbl.
/**
 * After each major compaction, stats need to be updated on the table:
 * 1. create a partitioned ORC-backed table (ORC is currently required by ACID)
 * 2. populate it with data
 * 3. compute stats
 * 4. trigger a major compaction on one of the partitions (which should update stats)
 * 5. check that stats have been updated for that partition only
 *
 * @throws Exception ex
 * TODO: add a test with a sorted table
 */
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
// as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
String dbName = "default";
String tblName = "compaction_test";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(bkt INT)" + // currently ACID requires table to be bucketed
" CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
HiveStreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withStaticPartitionValues(Arrays.asList("0")).withAgentInfo("UT_" + Thread.currentThread().getName()).withHiveConf(conf).withRecordWriter(writer).connect();
connection.beginTransaction();
connection.write("55, 'London'".getBytes());
connection.commitTransaction();
connection.beginTransaction();
connection.write("56, 'Paris'".getBytes());
connection.commitTransaction();
connection.close();
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(57, 'Budapest')", driver);
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(58, 'Milano')", driver);
execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
Table table = msClient.getTable(dbName, tblName);
// compute stats before compaction
CompactionInfo ci = new CompactionInfo(dbName, tblName, "bkt=0", CompactionType.MAJOR);
Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
ci = new CompactionInfo(dbName, tblName, "bkt=1", CompactionType.MAJOR);
Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
// Check basic stats are collected
org.apache.hadoop.hive.ql.metadata.Table hiveTable = Hive.get().getTable(tblName);
List<org.apache.hadoop.hive.ql.metadata.Partition> partitions = Hive.get().getPartitions(hiveTable);
Map<String, String> parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1373", parameters.get("totalSize"));
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
// Do a major compaction
CompactionRequest rqst = new CompactionRequest(dbName, tblName, CompactionType.MAJOR);
rqst.setPartitionname("bkt=0");
txnHandler.compact(rqst);
runWorker(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
if (1 != compacts.size()) {
Assert.fail("Expecting 1 file and found " + compacts.size() + " files " + compacts);
}
Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
// Check basic stats are updated for partition bkt=0, but not updated for partition bkt=1
partitions = Hive.get().getPartitions(hiveTable);
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "1", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "801", parameters.get("totalSize"));
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
}
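The partition-stat checks above repeat the same three assertions four times. A minimal sketch of a helper that could fold them together, assuming it lives in the same test class and reuses its imports (the method name is illustrative, not part of the Hive source):

// Illustrative helper: look up one partition by name and assert the basic stats
// (numFiles, numRows, totalSize) that compaction is expected to keep up to date.
private static void assertPartitionStats(List<org.apache.hadoop.hive.ql.metadata.Partition> partitions, String partName, String numFiles, String numRows, String totalSize) {
  Map<String, String> parameters = partitions.stream()
      .filter(p -> p.getName().equals(partName))
      .findFirst()
      .orElseThrow(() -> new RuntimeException("Could not get Partition " + partName))
      .getParameters();
  Assert.assertEquals("numFiles differs for " + partName, numFiles, parameters.get("numFiles"));
  Assert.assertEquals("numRows differs for " + partName, numRows, parameters.get("numRows"));
  Assert.assertEquals("totalSize differs for " + partName, totalSize, parameters.get("totalSize"));
}

After the compaction, the calls would read, for example, assertPartitionStats(partitions, "bkt=0", "1", "2", "801") and assertPartitionStats(partitions, "bkt=1", "2", "2", "1442").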
Use of org.apache.hadoop.hive.metastore.txn.TxnStore in project hive by apache.
The class TestCrudCompactorOnTez, method testStatsAfterQueryCompactionOnTez.
/**
 * After each major compaction, stats need to be updated on the table:
 * 1. create an ORC-backed table (ORC is currently required by ACID)
 * 2. populate it with data
 * 3. compute stats
 * 4. trigger a major compaction (which should update stats)
 * 5. check that stats have been updated
 */
@Test
public void testStatsAfterQueryCompactionOnTez() throws Exception {
// as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
String dbName = "default";
String tblName = "compaction_test";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + // currently ACID requires table to be bucketed
" CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " values(55, 'London')", driver);
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " values(56, 'Paris')", driver);
execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
Table table = msClient.getTable(dbName, tblName);
// compute stats before compaction
CompactionInfo ci = new CompactionInfo(dbName, tblName, null, CompactionType.MAJOR);
Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
// Check basic stats are collected
Map<String, String> parameters = Hive.get().getTable(tblName).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1434", parameters.get("totalSize"));
// Do a major compaction
CompactorTestUtil.runCompaction(conf, dbName, tblName, CompactionType.MAJOR, true);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
if (1 != compacts.size()) {
Assert.fail("Expecting 1 file and found " + compacts.size() + " files " + compacts);
}
Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
// Check basic stats are updated
parameters = Hive.get().getTable(tblName).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "1", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "727", parameters.get("totalSize"));
}
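CompactorTestUtil.runCompaction hides the request-and-run cycle that testStatsAfterCompactionPartTbl spells out with txnHandler.compact and runWorker. A hedged sketch of that explicit cycle as a reusable method, assuming it sits in the same test class so the conf field and the runWorker utility are available as above:

// Illustrative sketch: enqueue a MAJOR compaction through the TxnStore, run one Worker
// cycle, and hand back the compaction queue entries so the caller can assert their state.
private List<ShowCompactResponseElement> compactAndShow(TxnStore txnHandler, String dbName, String tblName, String partName) throws Exception {
  CompactionRequest rqst = new CompactionRequest(dbName, tblName, CompactionType.MAJOR);
  if (partName != null) {
    rqst.setPartitionname(partName);
  }
  txnHandler.compact(rqst);
  runWorker(conf);
  return txnHandler.showCompact(new ShowCompactRequest()).getCompacts();
}

With a helper like this, the check above reduces to asserting that the single returned entry is in the "ready for cleaning" state before re-reading the table parameters.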
Use of org.apache.hadoop.hive.metastore.txn.TxnStore in project hive by apache.
The class TestCompactor, method testTableProperties.
/**
 * Users have the choice of specifying compaction-related tblproperties either in the CREATE TABLE
 * statement or in the ALTER TABLE .. COMPACT statement. This tests both cases.
 */
@Test
public void testTableProperties() throws Exception {
conf.setVar(HiveConf.ConfVars.COMPACTOR_JOB_QUEUE, "root.user1");
// plain acid table
String tblName1 = "ttp1";
// acid table with customized tblproperties
String tblName2 = "ttp2";
executeStatementOnDriver("drop table if exists " + tblName1, driver);
executeStatementOnDriver("drop table if exists " + tblName2, driver);
executeStatementOnDriver("CREATE TABLE " + tblName1 + "(a INT, b STRING) " + " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC" + " TBLPROPERTIES ('transactional'='true', 'orc.compress.size'='2700')", driver);
executeStatementOnDriver("CREATE TABLE " + tblName2 + "(a INT, b STRING) " + " CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES (" + "'transactional'='true'," + // 2048 MB memory for compaction map job
"'compactor.mapreduce.map.memory.mb'='2048'," + // minor compaction if more than 4 delta dirs
"'compactorthreshold.hive.compactor.delta.num.threshold'='4'," + // major compaction if more than 47%
"'compactorthreshold.hive.compactor.delta.pct.threshold'='0.47'," + // Override the system wide compactor queue for this table
"'compactor.hive.compactor.job.queue'='root.user2'" + ")", driver);
// Insert 5 rows to both tables
executeStatementOnDriver("insert into " + tblName1 + " values (1, 'a')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (2, 'b')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (3, 'c')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (4, 'd')", driver);
executeStatementOnDriver("insert into " + tblName1 + " values (5, 'e')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (1, 'a')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (2, 'b')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (3, 'c')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (4, 'd')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (5, 'e')", driver);
runInitiator(conf);
// Compactor should only schedule compaction for ttp2 (delta.num.threshold=4), not ttp1
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(1, rsp.getCompacts().size());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(CompactionType.MAJOR, // type is MAJOR since there's no base yet
rsp.getCompacts().get(0).getType());
// Finish the scheduled compaction for ttp2, and manually compact ttp1, to make them comparable again
executeStatementOnDriver("alter table " + tblName1 + " compact 'major'", driver);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(2, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
Assert.assertEquals("ttp1", rsp.getCompacts().get(1).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(1).getState());
// compact ttp2, by running the Worker explicitly, in order to get the reference to the compactor MR job
runWorker(conf);
// Compact ttp1
runWorker(conf);
// Clean up
runCleaner(conf);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(2, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, rsp.getCompacts().get(0).getState());
Assert.assertEquals("ttp1", rsp.getCompacts().get(1).getTablename());
Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, rsp.getCompacts().get(1).getState());
/**
 * We just did a major compaction on ttp1. Open any file produced by it and check the buffer size:
 * it should be the 2700 set via orc.compress.size in CREATE TABLE, not the ORC library default.
 */
List<String> rs = execSelectAndDumpData("select distinct INPUT__FILE__NAME from " + tblName1, driver, "Find Orc File bufer default");
Assert.assertTrue("empty rs?", rs != null && rs.size() > 0);
Path p = new Path(rs.get(0));
try (Reader orcReader = OrcFile.createReader(p.getFileSystem(conf), p)) {
Assert.assertEquals("Expected default compression size", 2700, orcReader.getCompressionSize());
}
// make sure 2700 is not the default so that we are testing if tblproperties indeed propagate
Assert.assertNotEquals("Unexpected default compression size", 2700, OrcConf.BUFFER_SIZE.getDefaultValue());
// Insert one more row - this should trigger hive.compactor.delta.pct.threshold to be reached for ttp2
executeStatementOnDriver("insert into " + tblName1 + " values (6, 'f')", driver);
executeStatementOnDriver("insert into " + tblName2 + " values (6, 'f')", driver);
// Intentionally set this high so that it will not trigger major compaction for ttp1.
// Only trigger major compaction for ttp2 (delta.pct.threshold=0.47) because of the newly inserted row (actual pct: 0.66)
conf.setFloatVar(HiveConf.ConfVars.HIVE_COMPACTOR_DELTA_PCT_THRESHOLD, 0.8f);
runInitiator(conf);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(3, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
// Finish the scheduled compaction for ttp2
runWorker(conf);
runCleaner(conf);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(3, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.SUCCEEDED_RESPONSE, rsp.getCompacts().get(0).getState());
// Now test tblproperties specified on ALTER TABLE .. COMPACT .. statement
executeStatementOnDriver("insert into " + tblName2 + " values (7, 'g')", driver);
executeStatementOnDriver("alter table " + tblName2 + " compact 'major'" + " with overwrite tblproperties (" + "'compactor.mapreduce.map.memory.mb'='3072'," + "'tblprops.orc.compress.size'='3141'," + "'compactor.hive.compactor.job.queue'='root.user2')", driver);
rsp = txnHandler.showCompact(new ShowCompactRequest());
Assert.assertEquals(4, rsp.getCompacts().size());
Assert.assertEquals("ttp2", rsp.getCompacts().get(0).getTablename());
Assert.assertEquals(TxnStore.INITIATED_RESPONSE, rsp.getCompacts().get(0).getState());
// make sure we are checking the right (latest) compaction entry
Assert.assertEquals(4, rsp.getCompacts().get(0).getId());
// Run the Worker explicitly, in order to get the reference to the compactor MR job
runWorker(conf);
// we just ran Major compaction so we should have a base_x in tblName2 that has the new files
// Get the name of a file and look at its properties to see if orc.compress.size was respected.
rs = execSelectAndDumpData("select distinct INPUT__FILE__NAME from " + tblName2, driver, "Find Compacted Orc File");
Assert.assertTrue("empty rs?", rs != null && rs.size() > 0);
p = new Path(rs.get(0));
try (Reader orcReader = OrcFile.createReader(p.getFileSystem(conf), p)) {
Assert.assertEquals("File written with wrong buffer size", 3141, orcReader.getCompressionSize());
}
}
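The assertions above rely on getCompacts() returning the newest entry first, and at one point double-check this via getId(). A hedged alternative that picks the latest entry for a table explicitly, assuming it is placed in the same test class (java.util.Comparator imported) and that ShowCompactResponseElement.getId() returns the numeric compaction id as used above:

// Illustrative sketch: select the most recent compaction entry for a table by id,
// rather than depending on the ordering of ShowCompactResponse.getCompacts().
private static ShowCompactResponseElement latestCompactionFor(ShowCompactResponse rsp, String tblName) {
  return rsp.getCompacts().stream()
      .filter(e -> tblName.equals(e.getTablename()))
      .max(Comparator.comparingLong(ShowCompactResponseElement::getId))
      .orElseThrow(() -> new AssertionError("No compaction entry found for " + tblName));
}

For example, the check after the last initiation could then be written as Assert.assertEquals(TxnStore.INITIATED_RESPONSE, latestCompactionFor(rsp, "ttp2").getState()).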