use of org.apache.hive.streaming.StrictDelimitedInputWriter in project hive by apache.
In the class TestCompactor, method testSkippedCompactionCleanerKeepsAborted.
/**
* There is a special case handled in the Compaction Worker that skips compaction
* if there is only one valid delta; such a skipped compaction is still cleaned up if there are aborted directories.
* However, if no compaction was done, deltas containing mixed aborted/committed writes from streaming cannot be cleaned,
* and the metadata belonging to those aborted transactions cannot be removed.
* @see Worker.isEnoughToCompact
* @throws Exception ex
*/
@Test
public void testSkippedCompactionCleanerKeepsAborted() throws Exception {
String dbName = "default";
String tblName = "cws";
String agentInfo = "UT_" + Thread.currentThread().getName();
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(b STRING) " + " PARTITIONED BY (a INT) STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
executeStatementOnDriver("alter table " + tblName + " add partition(a=1)", driver);
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
// Create initial aborted txn
HiveStreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withStaticPartitionValues(Collections.singletonList("1")).withAgentInfo(agentInfo).withHiveConf(conf).withRecordWriter(writer).withStreamingOptimizations(true).withTransactionBatchSize(1).connect();
connection.beginTransaction();
connection.write("3,1".getBytes());
connection.write("4,1".getBytes());
connection.abortTransaction();
connection.close();
// Create a sequence of commit, abort, commit to the same delta folder
connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withStaticPartitionValues(Collections.singletonList("1")).withAgentInfo(agentInfo).withHiveConf(conf).withRecordWriter(writer).withStreamingOptimizations(true).withTransactionBatchSize(3).connect();
connection.beginTransaction();
connection.write("1,1".getBytes());
connection.write("2,1".getBytes());
connection.commitTransaction();
connection.beginTransaction();
connection.write("3,1".getBytes());
connection.write("4,1".getBytes());
connection.abortTransaction();
connection.beginTransaction();
connection.write("5,1".getBytes());
connection.write("6,1".getBytes());
connection.commitTransaction();
connection.close();
// Check that aborted rows are not read back
driver.run("select * from cws");
List res = new ArrayList();
driver.getFetchTask().fetch(res);
Assert.assertEquals(4, res.size());
int count = TestTxnDbUtil.countQueryAgent(conf, "select count(*) from TXN_COMPONENTS");
Assert.assertEquals("There should be 2 record for two aborted transaction", 2, count);
// Start a compaction that will be skipped because there is only one valid delta
driver.run("alter table cws partition(a='1') compact 'minor'");
runWorker(conf);
// Cleaner should not delete info about aborted txn 2
runCleaner(conf);
txnHandler.cleanEmptyAbortedAndCommittedTxns();
count = TestTxnDbUtil.countQueryAgent(conf, "select count(*) from TXN_COMPONENTS");
Assert.assertEquals("There should be 1 record for the second aborted transaction", 1, count);
driver.run("select * from cws");
res.clear();
driver.getFetchTask().fetch(res);
Assert.assertEquals(4, res.size());
}
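The interplay described in the Javadoc hinges on the transaction batch size used by the streaming connection. The following is a minimal, hedged sketch (not part of the test; it assumes a live HiveConf named conf and the same default.cws table and partition as above, and the writer/agent names are illustrative): with a batch size of 1 every transaction gets its own delta directory, so an abort leaves a purely aborted delta that the Cleaner can remove, while a larger batch lets aborted and committed writes share one delta directory, which is exactly the situation the test exercises.
StrictDelimitedInputWriter sketchWriter = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
// one delta directory per transaction; an abort here stays isolated from committed data
HiveStreamingConnection sketchConnection = HiveStreamingConnection.newBuilder().withDatabase("default").withTable("cws").withStaticPartitionValues(Collections.singletonList("1")).withAgentInfo("sketch-agent").withHiveConf(conf).withRecordWriter(sketchWriter).withTransactionBatchSize(1).connect();
try {
sketchConnection.beginTransaction();
sketchConnection.write("7,1".getBytes());
// produces an aborted-only delta that the Cleaner is allowed to drop
sketchConnection.abortTransaction();
} finally {
sketchConnection.close();
}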
use of org.apache.hive.streaming.StrictDelimitedInputWriter in project hive by apache.
In the class TestCompactor, method testStatsAfterCompactionPartTbl.
/**
* After each major compaction, stats need to be updated on the table:
* 1. create a partitioned, ORC-backed table (ORC is currently required by ACID)
* 2. populate it with data
* 3. compute stats
* 4. trigger major compaction on one of the partitions (which should update stats)
* 5. check that stats have been updated for that partition only
*
* TODO: add a test with a sorted table?
* @throws Exception ex
*/
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
// as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
String dbName = "default";
String tblName = "compaction_test";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(bkt INT)" + // currently ACID requires table to be bucketed
" CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
HiveStreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withStaticPartitionValues(Arrays.asList("0")).withAgentInfo("UT_" + Thread.currentThread().getName()).withHiveConf(conf).withRecordWriter(writer).connect();
connection.beginTransaction();
connection.write("55, 'London'".getBytes());
connection.commitTransaction();
connection.beginTransaction();
connection.write("56, 'Paris'".getBytes());
connection.commitTransaction();
connection.close();
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(57, 'Budapest')", driver);
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(58, 'Milano')", driver);
execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
Table table = msClient.getTable(dbName, tblName);
// compute stats before compaction
CompactionInfo ci = new CompactionInfo(dbName, tblName, "bkt=0", CompactionType.MAJOR);
Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
ci = new CompactionInfo(dbName, tblName, "bkt=1", CompactionType.MAJOR);
Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
// Check basic stats are collected
org.apache.hadoop.hive.ql.metadata.Table hiveTable = Hive.get().getTable(tblName);
List<org.apache.hadoop.hive.ql.metadata.Partition> partitions = Hive.get().getPartitions(hiveTable);
Map<String, String> parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1373", parameters.get("totalSize"));
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
// Do a major compaction
CompactionRequest rqst = new CompactionRequest(dbName, tblName, CompactionType.MAJOR);
rqst.setPartitionname("bkt=0");
txnHandler.compact(rqst);
runWorker(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
if (1 != compacts.size()) {
Assert.fail("Expecting 1 file and found " + compacts.size() + " files " + compacts);
}
Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
// Check basic stats are updated for partition bkt=0, but not updated for partition bkt=1
partitions = Hive.get().getPartitions(hiveTable);
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "1", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "801", parameters.get("totalSize"));
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
}
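The stats checks above repeat the same partition lookup several times. A hypothetical helper (not part of the Hive test; the method name is illustrative and it assumes hiveTable is resolved as in the test) could distill that lookup as follows:
static Map<String, String> basicStats(org.apache.hadoop.hive.ql.metadata.Table hiveTable, String partName) throws Exception {
// returns the partition parameters holding the basic stats: numFiles, numRows, totalSize
return Hive.get().getPartitions(hiveTable).stream().filter(p -> p.getName().equals(partName)).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition " + partName)).getParameters();
}
// possible usage inside the test:
// Assert.assertEquals("1", basicStats(hiveTable, "bkt=0").get("numFiles"));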
use of org.apache.hive.streaming.StrictDelimitedInputWriter in project hive by apache.
In the class TestCrudCompactorOnTez, method testMinorCompactionWhileStreamingWithAbortInMiddle.
@Test
public void testMinorCompactionWhileStreamingWithAbortInMiddle() throws Exception {
String dbName = "default";
String tableName = "testMinorCompaction";
executeStatementOnDriver("drop table if exists " + tableName, driver);
executeStatementOnDriver("CREATE TABLE " + tableName + "(a INT, b STRING) " + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
StreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tableName).withAgentInfo("UT_" + Thread.currentThread().getName()).withHiveConf(conf).withRecordWriter(writer).connect();
connection.beginTransaction();
connection.write("50,Kiev".getBytes());
connection.write("51,St. Petersburg".getBytes());
connection.write("52,Boston".getBytes());
connection.commitTransaction();
connection.beginTransaction();
connection.write("60,Budapest".getBytes());
connection.abortTransaction();
connection.beginTransaction();
connection.write("71,Szeged".getBytes());
connection.write("72,Debrecen".getBytes());
connection.commitTransaction();
connection.close();
CompactorTestUtil.runCompaction(conf, dbName, tableName, CompactionType.MINOR, true);
CompactorTestUtil.runCleaner(conf);
// Find the location of the table
IMetaStoreClient metaStoreClient = new HiveMetaStoreClient(conf);
Table table = metaStoreClient.getTable(dbName, tableName);
FileSystem fs = FileSystem.get(conf);
Assert.assertEquals("Delta names does not match", Collections.singletonList("delta_0000001_0000003_v0000006"), CompactorTestUtil.getBaseOrDeltaNames(fs, null, table, null));
CompactorTestUtil.checkExpectedTxnsPresent(null, new Path[] { new Path(table.getSd().getLocation(), "delta_0000001_0000003_v0000006") }, "a,b", "int:string", 0, 1L, 3L, Lists.newArrayList(2), 1);
}
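The expected directory name above follows the compacted-delta naming pattern, commonly read as delta_<minWriteId>_<maxWriteId>_v<visibilityTxnId>: the write id range merged by the minor compaction plus what is assumed here to be the compactor transaction's visibility id. A small, hypothetical parser using only standard Java illustrates the pieces:
java.util.regex.Matcher m = java.util.regex.Pattern.compile("delta_(\\d+)_(\\d+)_v(\\d+)").matcher("delta_0000001_0000003_v0000006");
if (m.matches()) {
// write ids 1..3 were merged; 6 is read as the compactor's visibility txn id
long minWriteId = Long.parseLong(m.group(1));
long maxWriteId = Long.parseLong(m.group(2));
long visibilityTxnId = Long.parseLong(m.group(3));
System.out.printf("writeIds %d-%d, visibility txn %d%n", minWriteId, maxWriteId, visibilityTxnId);
}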
use of org.apache.hive.streaming.StrictDelimitedInputWriter in project hive by apache.
In the class CompactorTestUtil, method writeBatch.
/**
* Open a Hive streaming connection and write some content in up to two transactions.
* @param conf hive configuration
* @param dbName name of the database
* @param tblName name of the table
* @param abort if true, abort each transaction instead of committing it
* @param keepOpen if true, return after the first transaction and keep the streaming connection open
* @return the still-open streaming connection when keepOpen is true, otherwise null
* @throws StreamingException if the streaming connection cannot be established
*/
static StreamingConnection writeBatch(HiveConf conf, String dbName, String tblName, boolean abort, boolean keepOpen) throws StreamingException {
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
StreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(dbName).withTable(tblName).withAgentInfo("UT_" + Thread.currentThread().getName()).withHiveConf(conf).withRecordWriter(writer).withTransactionBatchSize(2).connect();
connection.beginTransaction();
if (abort) {
connection.abortTransaction();
} else {
connection.write("50,Kiev".getBytes());
connection.write("51,St. Petersburg".getBytes());
connection.write("44,Boston".getBytes());
connection.commitTransaction();
}
if (!keepOpen) {
connection.beginTransaction();
if (abort) {
connection.abortTransaction();
} else {
connection.write("52,Tel Aviv".getBytes());
connection.write("53,Atlantis".getBytes());
connection.write("53,Boston".getBytes());
connection.commitTransaction();
}
connection.close();
return null;
}
return connection;
}
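A hedged usage sketch for the helper above (the table name is a placeholder and conf is assumed to be the test's HiveConf): with keepOpen set to true the helper returns after the first transaction and the caller owns the still-open connection, while with keepOpen set to false it writes both transactions, closes the connection itself and returns null.
StreamingConnection open = CompactorTestUtil.writeBatch(conf, "default", "some_table", false, true);
// ... run the compactor or assertions while the streaming connection and its transaction batch are still open ...
if (open != null) {
open.close();
}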
use of org.apache.hive.streaming.StrictDelimitedInputWriter in project hive by apache.
In the class TestReplicationOfHiveStreaming, method testHiveStreamingUnpartitionedWithTxnBatchSizeAsOne.
@Test
public void testHiveStreamingUnpartitionedWithTxnBatchSizeAsOne() throws Throwable {
primary.dump(primaryDbName);
replica.loadWithoutExplain(replicatedDbName, primaryDbName);
// Create an ACID table.
String tblName = "alerts";
primary.run("use " + primaryDbName).run("create table " + tblName + "( id int , msg string ) " + "clustered by (id) into 5 buckets " + "stored as orc tblproperties(\"transactional\"=\"true\")");
// Create delimited record writer whose schema exactly matches table schema
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
// Create and open a streaming connection (the target table has to exist already)
// By default, txn batch size is 1.
StreamingConnection connection = HiveStreamingConnection.newBuilder().withDatabase(primaryDbName).withTable(tblName).withAgentInfo("example-agent-1").withRecordWriter(writer).withHiveConf(primary.getConf()).connect();
// Begin a transaction, write records and commit 1st transaction
connection.beginTransaction();
connection.write("1,val1".getBytes());
connection.write("2,val2".getBytes());
connection.commitTransaction();
// Replicate the committed data which should be visible.
primary.dump(primaryDbName);
replica.loadWithoutExplain(replicatedDbName, primaryDbName).run("use " + replicatedDbName).run("select msg from " + tblName + " order by msg").verifyResults((new String[] { "val1", "val2" }));
// Begin another transaction, write more records and commit 2nd transaction after REPL LOAD.
connection.beginTransaction();
connection.write("3,val3".getBytes());
connection.write("4,val4".getBytes());
// Replicate events before committing txn. The uncommitted data shouldn't be seen.
primary.dump(primaryDbName);
replica.loadWithoutExplain(replicatedDbName, primaryDbName).run("use " + replicatedDbName).run("select msg from " + tblName + " order by msg").verifyResults((new String[] { "val1", "val2" }));
connection.commitTransaction();
// After commit, the data should be replicated and visible.
primary.dump(primaryDbName);
replica.loadWithoutExplain(replicatedDbName, primaryDbName).run("use " + replicatedDbName).run("select msg from " + tblName + " order by msg").verifyResults((new String[] { "val1", "val2", "val3", "val4" }));
// Begin another transaction, write more records and abort 3rd transaction
connection.beginTransaction();
connection.write("5,val5".getBytes());
connection.write("6,val6".getBytes());
connection.abortTransaction();
// Aborted data shouldn't be visible.
primary.dump(primaryDbName);
replica.loadWithoutExplain(replicatedDbName, primaryDbName).run("use " + replicatedDbName).run("select msg from " + tblName + " order by msg").verifyResults((new String[] { "val1", "val2", "val3", "val4" }));
// Close the streaming connection
connection.close();
}
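As the comment above notes, the builder defaults to a transaction batch size of 1. A minimal hedged variant that spells that default out explicitly would look like the sketch below; it reuses primaryDbName, tblName and primary from the test, while the writer and agent name are new, illustrative values.
StrictDelimitedInputWriter batchWriter = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
// withTransactionBatchSize(1) is assumed to match the documented default of one transaction per batch
StreamingConnection explicitBatch = HiveStreamingConnection.newBuilder().withDatabase(primaryDbName).withTable(tblName).withAgentInfo("example-agent-2").withRecordWriter(batchWriter).withHiveConf(primary.getConf()).withTransactionBatchSize(1).connect();
explicitBatch.close();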