use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
the class TimePartitionedFileSetTest method testInputConfigurationFailure.
/**
 * Asserts that {@code getInputFormatConfiguration()} throws an exception for a dataset
 * instance obtained with the given arguments.
 *
 * @param arguments arguments passed to {@code dsFrameworkUtil.getInstance} when obtaining the dataset
 * @param why text appended to the failure message if no exception is thrown
 * @throws Exception if obtaining the dataset or executing the transaction fails
 */
private void testInputConfigurationFailure(Map<String, String> arguments, final String why) throws Exception {
  final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
  TransactionExecutor.Subroutine expectRejection = new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      boolean rejected = false;
      try {
        dataset.getInputFormatConfiguration();
      } catch (Exception e) {
        // expected: the configuration must not be computable with these arguments
        rejected = true;
      }
      if (!rejected) {
        Assert.fail("getInputFormatConfiguration should fail " + why);
      }
    }
  };
  dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) dataset).execute(expectRejection);
}
use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
the class TimePartitionedFileSetTest method testInputPartitionPaths.
/**
 * Verifies that the input paths computed for a time range consist of exactly the
 * partitions expected for that range.
 */
@Test
public void testInputPartitionPaths() throws Exception {
  final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
  TransactionExecutor txExecutor = dsFrameworkUtil.newInMemoryTransactionExecutor((TransactionAware) tpfs);

  // the dataset must start out without any partitions
  txExecutor.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      validateTimePartitions(tpfs, 0L, MAX, Collections.<Long, String>emptyMap());
    }
  });

  // register partitions at the base time and 5, 10 and 12 minutes after it
  final long baseTime = DATE_FORMAT.parse("6/4/12 10:00 am").getTime();
  txExecutor.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      tpfs.addPartition(baseTime, "file");
      tpfs.addPartition(baseTime + 5 * MINUTE, "file5");
      tpfs.addPartition(baseTime + 10 * MINUTE, "file10");
      tpfs.addPartition(baseTime + 12 * MINUTE, "file12");
    }
  });

  // NOTE(review): the two numeric arguments look like start/end offsets in minutes relative
  // to the base time, with an exclusive end - confirm against validateInputPaths
  validateInputPaths(baseTime, -10, -5);
  validateInputPaths(baseTime, -10, 2, "file");
  validateInputPaths(baseTime, 1, 11, "file5", "file10");
  validateInputPaths(baseTime, 1, 15, "file5", "file10", "file12");
  validateInputPaths(baseTime, 5, 10, "file5");
}
use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
the class TimePartitionedFileSetTest method testAddGetPartitions.
/**
 * Adds, looks up, lists and drops partitions of a time-partitioned file set inside a single
 * transaction, checking after every change that point lookups and range listings return
 * exactly the expected partitions.
 *
 * @throws Exception if obtaining the dataset or executing the transaction fails
 */
@Test
public void testAddGetPartitions() throws Exception {
  final TimePartitionedFileSet fileSet = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
  TransactionAware txAwareDataset = (TransactionAware) fileSet;
  dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // this is an arbitrary date to use as the test time
      long time = DATE_FORMAT.parse("12/10/14 5:10 am").getTime();
      long time2 = time + HOUR;
      String firstPath = "first/partition";
      String secondPath = "second/partition";

      // make sure the file set has no partitions initially
      validateTimePartition(fileSet, time, null);
      validateTimePartitions(fileSet, 0L, MAX, Collections.<Long, String>emptyMap());

      // add a partition, verify getPartition() works
      fileSet.addPartition(time, firstPath);
      validateTimePartition(fileSet, time, firstPath);

      Map<Long, String> expectNone = Collections.emptyMap();
      Map<Long, String> expectFirst = ImmutableMap.of(time, firstPath);
      Map<Long, String> expectSecond = ImmutableMap.of(time2, secondPath);
      Map<Long, String> expectBoth = ImmutableMap.of(time, firstPath, time2, secondPath);

      // verify various ways to list partitions with various ranges
      validateTimePartitions(fileSet, time + MINUTE, MAX, expectNone);
      validateTimePartitions(fileSet, 0L, time, expectNone);
      validateTimePartitions(fileSet, 0L, MAX, expectFirst);
      validateTimePartitions(fileSet, 0L, time + MINUTE, expectFirst);
      validateTimePartitions(fileSet, 0L, time + HOUR, expectFirst);
      validateTimePartitions(fileSet, time - HOUR, time + HOUR, expectFirst);

      // add and verify another partition
      fileSet.addPartition(time2, secondPath);
      validateTimePartition(fileSet, time2, secondPath);

      // verify various ways to list partitions with various ranges
      validateTimePartitions(fileSet, 0L, MAX, expectBoth);
      validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectFirst);
      validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
      validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
      validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectBoth);

      // adding another partition with an existing key must be rejected
      try {
        fileSet.addPartition(time2, "third/partition");
        Assert.fail("Should have thrown Exception for duplicate partition");
      } catch (DataSetException e) {
        // expected
      }

      // remove first partition and validate
      fileSet.dropPartition(time);
      validateTimePartition(fileSet, time, null);

      // verify various ways to list partitions with various ranges
      validateTimePartitions(fileSet, 0L, MAX, expectSecond);
      validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectNone);
      validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
      validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
      validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);

      // dropping the partition that was already removed must not throw
      try {
        fileSet.dropPartition(time);
      } catch (DataSetException e) {
        // keep the original exception as the cause so the failure report shows what went wrong
        throw new AssertionError("Should not have thrown Exception for removing non-existent partition", e);
      }
    }
  });
}
use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testTimePartitionedFileSet.
/**
 * Creates a time-partitioned file set with explore enabled, adds and drops partitions, and
 * checks through Hive commands that the table, its partitions and its rows reflect each
 * change. Finally deletes the dataset and creates it again.
 */
@Test
public void testTimePartitionedFileSet() throws Exception {
  final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parts");
  final String tableName = getDatasetHiveName(datasetInstanceId);

  // statements and expected partition names that are used more than once below
  final String showTables = "show tables";
  final String showPartitions = "show partitions " + tableName;
  final String selectAll = "SELECT key, value FROM " + tableName + " ORDER BY key, value";
  final String selectHourTwo = "SELECT key, value FROM " + tableName + " WHERE hour = 2 ORDER BY key, value";
  final String partitionHour1 = "year=2014/month=12/day=10/hour=1/minute=0";
  final String partitionHour2 = "year=2014/month=12/day=10/hour=2/minute=0";
  final String partitionHour3 = "year=2014/month=12/day=10/hour=3/minute=0";

  // create a time partitioned file set backed by Avro files
  datasetFramework.addInstance(
    "timePartitionedFileSet",
    datasetInstanceId,
    FileSetProperties.builder()
      .setBasePath("somePath")
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", SCHEMA.toString())
      .build());

  // the Hive table for the file set must now be listed
  runCommand(
    NAMESPACE_ID, showTables, true,
    Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
    Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));

  // obtain the dataset instance to perform data operations
  TimePartitionedFileSet partitioned =
    datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  Assert.assertTrue(partitioned instanceof TransactionAware);

  // Hive expects a partition to be a directory, so each partition is a directory holding one file
  long oneAm = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
  long twoAm = DATE_FORMAT.parse("12/10/14 2:00 am").getTime();
  long threeAm = DATE_FORMAT.parse("12/10/14 3:00 am").getTime();
  Location firstFile = partitioned.getEmbeddedFileSet().getLocation("file1/nn");
  Location secondFile = partitioned.getEmbeddedFileSet().getLocation("file2/nn");
  Location thirdFile = partitioned.getEmbeddedFileSet().getLocation("file3/nn");
  FileWriterHelper.generateAvroFile(firstFile.getOutputStream(), "x", 1, 2);
  FileWriterHelper.generateAvroFile(secondFile.getOutputStream(), "y", 2, 3);
  FileWriterHelper.generateAvroFile(thirdFile.getOutputStream(), "x", 3, 4);
  addTimePartition(partitioned, oneAm, "file1");
  addTimePartition(partitioned, twoAm, "file2");
  addTimePartition(partitioned, threeAm, "file3");

  // all three partitions must be visible in Hive
  runCommand(
    NAMESPACE_ID, showPartitions, true,
    Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
    Lists.newArrayList(
      new QueryResult(Lists.<Object>newArrayList(partitionHour1)),
      new QueryResult(Lists.<Object>newArrayList(partitionHour2)),
      new QueryResult(Lists.<Object>newArrayList(partitionHour3))));

  // querying the whole table returns the rows of all three files
  runCommand(
    NAMESPACE_ID, selectAll, true,
    Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)),
    Lists.newArrayList(
      new QueryResult(Lists.<Object>newArrayList("x1", "#1")),
      new QueryResult(Lists.<Object>newArrayList("x3", "#3")),
      new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));

  // restricting the query to hour = 2 returns only the row of the second file
  runCommand(
    NAMESPACE_ID, selectHourTwo, true,
    Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)),
    Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));

  // remove the 2 am partition
  dropTimePartition(partitioned, twoAm);

  // the rows of the dropped partition must no longer be returned
  runCommand(
    NAMESPACE_ID, selectAll, true,
    Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)),
    Lists.newArrayList(
      new QueryResult(Lists.<Object>newArrayList("x1", "#1")),
      new QueryResult(Lists.<Object>newArrayList("x3", "#3"))));

  // the dropped partition must no longer be listed in Hive
  runCommand(
    NAMESPACE_ID, showPartitions, true,
    Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
    Lists.newArrayList(
      new QueryResult(Lists.<Object>newArrayList(partitionHour1)),
      new QueryResult(Lists.<Object>newArrayList(partitionHour3))));

  // drop the dataset
  datasetFramework.deleteInstance(datasetInstanceId);

  // the Hive table must be gone
  runCommand(
    NAMESPACE_ID, showTables, false,
    Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
    Collections.<QueryResult>emptyList());

  // create the dataset again with the same properties
  datasetFramework.addInstance(
    "timePartitionedFileSet",
    datasetInstanceId,
    FileSetProperties.builder()
      .setBasePath("somePath")
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", SCHEMA.toString())
      .build());

  // the Hive table must be listed again
  runCommand(
    NAMESPACE_ID, showTables, true,
    Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
    Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
}
use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
the class MapReduceWithPartitionedTest method testTimePartitionedWithMR.
/**
 * Runs the partition writer program of {@code AppWithTimePartitionedFileSet} twice with
 * different output partition times, then runs the partition reader program over three input
 * time ranges (both partitions, only the first, none) and checks the row written for each.
 */
@Test
public void testTimePartitionedWithMR() throws Exception {
  final ApplicationWithPrograms app = deployApp(AppWithTimePartitionedFileSet.class);
  final long fiveMinutes = TimeUnit.MINUTES.toMillis(5);

  // seed the input table with a single row "x"
  final Table inputTable = datasetCache.getDataset(AppWithTimePartitionedFileSet.INPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) inputTable)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        inputTable.put(Bytes.toBytes("x"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
      }
    });

  final long firstTime = DATE_FORMAT.parse("1/15/15 11:15 am").getTime();
  final long secondTime = firstTime + fiveMinutes;

  // first writer run: output partition at firstTime, with explicit partition metadata
  Map<String, String> writerArgs = Maps.newHashMap();
  Map<String, String> outputArgs = Maps.newHashMap();
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, firstTime);
  final ImmutableMap<String, String> assignedMetadata =
    ImmutableMap.of("region", "13", "data.source.name", "input", "data.source.type", "table");
  TimePartitionedFileSetArguments.setOutputPartitionMetadata(outputArgs, assignedMetadata);
  writerArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
  boolean firstWriteSucceeded =
    runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(writerArgs));
  Assert.assertTrue(firstWriteSucceeded);

  // the run must have created a partition for firstTime carrying the assigned metadata
  final TimePartitionedFileSet tpfs = datasetCache.getDataset(TIME_PARTITIONED);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        TimePartitionDetail created = tpfs.getPartitionByTime(firstTime);
        Assert.assertNotNull(created);
        String relativePath = created.getRelativePath();
        Assert.assertNotNull(relativePath);
        Assert.assertTrue(relativePath.contains("2015-01-15/11-15"));
        Assert.assertEquals(assignedMetadata, created.getMetadata().asMap());
      }
    });

  // replace row "x" in the input table with a new row "y"
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) inputTable)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        inputTable.delete(Bytes.toBytes("x"));
        inputTable.put(Bytes.toBytes("y"), AppWithTimePartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
      }
    });

  // second writer run: same arguments, but the partition time is five minutes later
  TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, secondTime);
  writerArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs));
  // make the mapreduce add the partition in destroy, to validate that this does not fail the job
  writerArgs.put(AppWithTimePartitionedFileSet.COMPAT_ADD_PARTITION, "true");
  boolean secondWriteSucceeded =
    runProgram(app, AppWithTimePartitionedFileSet.PartitionWriter.class, new BasicArguments(writerArgs));
  Assert.assertTrue(secondWriteSucceeded);

  // the run must have created a partition for secondTime
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) tpfs)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Partition created = tpfs.getPartitionByTime(secondTime);
        Assert.assertNotNull(created);
        String relativePath = created.getRelativePath();
        Assert.assertNotNull(relativePath);
        Assert.assertTrue(relativePath.contains("2015-01-15/11-20"));
      }
    });

  // reader run over a range that covers both partitions; results go to row "a"
  Map<String, String> readerArgs = Maps.newHashMap();
  Map<String, String> inputArgs = Maps.newHashMap();
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, firstTime - fiveMinutes);
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, secondTime + fiveMinutes);
  readerArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
  readerArgs.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "a");
  boolean readBothSucceeded =
    runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(readerArgs));
  Assert.assertTrue(readBothSucceeded);

  // both partitions were read, so row "a" holds x and y
  final Table output = datasetCache.getDataset(AppWithTimePartitionedFileSet.OUTPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Row written = output.get(Bytes.toBytes("a"));
        Assert.assertEquals("1", written.getString("x"));
        Assert.assertEquals("2", written.getString("y"));
      }
    });

  // reader run over a range that covers only the first partition; results go to row "b"
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, firstTime - fiveMinutes);
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, firstTime + TimeUnit.MINUTES.toMillis(2));
  readerArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
  readerArgs.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "b");
  boolean readFirstSucceeded =
    runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(readerArgs));
  Assert.assertTrue(readFirstSucceeded);

  // only the first partition was read, so row "b" holds x but not y
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Row written = output.get(Bytes.toBytes("b"));
        Assert.assertEquals("1", written.getString("x"));
        Assert.assertNull(written.get("y"));
      }
    });

  // reader run over a range that matches no partition; results would go to row "n"
  TimePartitionedFileSetArguments.setInputStartTime(inputArgs, firstTime - TimeUnit.MINUTES.toMillis(10));
  TimePartitionedFileSetArguments.setInputEndTime(inputArgs, firstTime - TimeUnit.MINUTES.toMillis(9));
  readerArgs.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
  readerArgs.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "n");
  boolean readNoneSucceeded =
    runProgram(app, AppWithTimePartitionedFileSet.PartitionReader.class, new BasicArguments(readerArgs));
  Assert.assertTrue(readNoneSucceeded);

  // no partition was read, so row "n" is empty
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        Row written = output.get(Bytes.toBytes("n"));
        Assert.assertTrue(written.isEmpty());
      }
    });
}
Aggregations