Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
In class HiveExploreServiceFileSetTestRun, method testTimePartitionedFileSet:
@Test
public void testTimePartitionedFileSet() throws Exception {
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parts");
final String tableName = getDatasetHiveName(datasetInstanceId);
// create a time partitioned file set
datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder()
  .setBasePath("somePath")
  .setEnableExploreOnCreate(true)
  .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
  .setTableProperty("avro.schema.literal", SCHEMA.toString())
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
TimePartitionedFileSet tpfs = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(tpfs);
Assert.assertTrue(tpfs instanceof TransactionAware);
// add some partitions. Beware that Hive expects a partition to be a directory, so we create dirs with one file
long time1 = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
long time2 = DATE_FORMAT.parse("12/10/14 2:00 am").getTime();
long time3 = DATE_FORMAT.parse("12/10/14 3:00 am").getTime();
Location location1 = tpfs.getEmbeddedFileSet().getLocation("file1/nn");
Location location2 = tpfs.getEmbeddedFileSet().getLocation("file2/nn");
Location location3 = tpfs.getEmbeddedFileSet().getLocation("file3/nn");
FileWriterHelper.generateAvroFile(location1.getOutputStream(), "x", 1, 2);
FileWriterHelper.generateAvroFile(location2.getOutputStream(), "y", 2, 3);
FileWriterHelper.generateAvroFile(location3.getOutputStream(), "x", 3, 4);
addTimePartition(tpfs, time1, "file1");
addTimePartition(tpfs, time2, "file2");
addTimePartition(tpfs, time3, "file3");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=2/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", "#1")), new QueryResult(Lists.<Object>newArrayList("x3", "#3")), new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));
// verify that we can restrict the query to a single partition (hour = 2)
runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " WHERE hour = 2 ORDER BY key, value", true,
  Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)),
  Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("y2", "#2"))));
// remove a partition
dropTimePartition(tpfs, time2);
// verify that the data of the dropped partition no longer shows up in queries
runCommand(NAMESPACE_ID, "SELECT key, value FROM " + tableName + " ORDER BY key, value", true,
  Lists.newArrayList(new ColumnDesc("key", "STRING", 1, null), new ColumnDesc("value", "STRING", 2, null)),
  Lists.newArrayList(
    new QueryResult(Lists.<Object>newArrayList("x1", "#1")),
    new QueryResult(Lists.<Object>newArrayList("x3", "#3"))));
// verify the partition was removed from Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=1/minute=0")), new QueryResult(Lists.<Object>newArrayList("year=2014/month=12/day=10/hour=3/minute=0"))));
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
// recreate the dataset to verify that the Hive table can be created again under the same name
datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder()
  .setBasePath("somePath")
  .setEnableExploreOnCreate(true)
  .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
  .setTableProperty("avro.schema.literal", SCHEMA.toString())
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
}
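The addTimePartition and dropTimePartition helpers used above are defined elsewhere in the test class and are not shown in this excerpt. A minimal sketch of what they plausibly do (only the call signatures are taken from the snippet; the bodies are assumptions, and the real helpers also run these calls inside a transaction since the dataset is TransactionAware):

// Sketch only: transaction plumbing omitted.
private void addTimePartition(TimePartitionedFileSet tpfs, long time, String path) {
  // register the directory <path>, relative to the file set's base path, as the partition for 'time'
  tpfs.addPartition(time, path);
}

private void dropTimePartition(TimePartitionedFileSet tpfs, long time) {
  // remove the partition registered for 'time'
  tpfs.dropPartition(time);
}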
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
In class HiveExploreServiceFileSetTestRun, method testTPFSWithDateTimestamp:
@Test
public void testTPFSWithDateTimestamp() throws Exception {
TimeZone.setDefault(TimeZone.getTimeZone("UTC"));
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("dtfs");
final String tableName = getDatasetHiveName(datasetInstanceId);
final Schema dtSchema = Schema.recordOf("dt",
  Schema.Field.of("id", Schema.of(Schema.Type.INT)),
  Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
  Schema.Field.of("dt", Schema.of(Schema.LogicalType.DATE)),
  Schema.Field.of("ts", Schema.nullableOf(Schema.of(Schema.LogicalType.TIMESTAMP_MILLIS))));
// create a time partitioned file set
datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder()
  .setBasePath("somePath")
  .setEnableExploreOnCreate(true)
  .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
  .setTableProperty("avro.schema.literal", dtSchema.toString())
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.newArrayList(tableName))));
// Accessing dataset instance to perform data operations
TimePartitionedFileSet tpfs = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(tpfs);
Location location1 = tpfs.getEmbeddedFileSet().getLocation("file1/nn");
generateAvroFile(location1.getOutputStream(), dtSchema);
// add some partitions. Beware that Hive expects a partition to be a directory, so we create dirs with one file
long time1 = DATE_FORMAT.parse("12/10/14 1:00 am").getTime();
addTimePartition(tpfs, time1, "file1");
// verify that we can query the date and timestamp in the file with Hive
runCommand(NAMESPACE_ID, "SELECT id, name, dt, ts FROM " + tableName + " LIMIT 50", true, Lists.newArrayList(new ColumnDesc("id", "INT", 1, null), new ColumnDesc("name", "STRING", 2, null), new ColumnDesc("dt", "DATE", 3, null), new ColumnDesc("ts", "TIMESTAMP", 4, null)), Lists.newArrayList(new QueryResult(Lists.newArrayList(1, "alice", "1970-01-01", "2018-09-07 16:09:50.595"))));
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.emptyList());
// recreate the dataset to verify that the Hive table can be created again under the same name
datasetFramework.addInstance("timePartitionedFileSet", datasetInstanceId, FileSetProperties.builder()
  .setBasePath("somePath")
  .setEnableExploreOnCreate(true)
  .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
  .setTableProperty("avro.schema.literal", dtSchema.toString())
  .build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.newArrayList(tableName))));
}
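The generateAvroFile helper used here is also not shown. A sketch of what it could look like, assuming it writes a single Avro record matching the row asserted in the query above (only the call signature is taken from the snippet; the body, the Avro classes used, and the literal values are assumptions):

// Sketch only; requires org.apache.avro (GenericData, GenericRecord, GenericDatumWriter, DataFileWriter).
// Avro encodes a DATE as days since the epoch (int) and TIMESTAMP_MILLIS as milliseconds (long).
private void generateAvroFile(OutputStream out, Schema cdapSchema) throws IOException {
  org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(cdapSchema.toString());
  GenericRecord record = new GenericData.Record(avroSchema);
  record.put("id", 1);
  record.put("name", "alice");
  record.put("dt", 0);              // 0 days since epoch -> 1970-01-01
  record.put("ts", 1536336590595L); // 2018-09-07 16:09:50.595 UTC
  try (DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema))) {
    writer.create(avroSchema, out);
    writer.append(record);
  }
}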
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
In class TimePartitionedFileSetTest, method testAddGetPartitions:
@Test
public void testAddGetPartitions() throws Exception {
final TimePartitionedFileSet fileSet = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
TransactionAware txAwareDataset = (TransactionAware) fileSet;
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
// this is an arbitrary date to use as the test time
long time = DATE_FORMAT.parse("12/10/14 5:10 am").getTime();
long time2 = time + HOUR;
String firstPath = "first/partition";
String secondPath = "second/partition";
// make sure the file set has no partitions initially
validateTimePartition(fileSet, time, null);
validateTimePartitions(fileSet, 0L, MAX, Collections.<Long, String>emptyMap());
// add a partition, verify getPartition() works
fileSet.addPartition(time, firstPath);
validateTimePartition(fileSet, time, firstPath);
Map<Long, String> expectNone = Collections.emptyMap();
Map<Long, String> expectFirst = ImmutableMap.of(time, firstPath);
Map<Long, String> expectSecond = ImmutableMap.of(time2, secondPath);
Map<Long, String> expectBoth = ImmutableMap.of(time, firstPath, time2, secondPath);
// verify listing partitions over various time ranges
validateTimePartitions(fileSet, time + MINUTE, MAX, expectNone);
validateTimePartitions(fileSet, 0L, time, expectNone);
validateTimePartitions(fileSet, 0L, MAX, expectFirst);
validateTimePartitions(fileSet, 0L, time + MINUTE, expectFirst);
validateTimePartitions(fileSet, 0L, time + MINUTE, expectFirst);
validateTimePartitions(fileSet, 0L, time + HOUR, expectFirst);
validateTimePartitions(fileSet, time - HOUR, time + HOUR, expectFirst);
// add and verify another partition
fileSet.addPartition(time2, secondPath);
validateTimePartition(fileSet, time2, secondPath);
// verify listing partitions over various time ranges
validateTimePartitions(fileSet, 0L, MAX, expectBoth);
validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectFirst);
validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectBoth);
// try to add another partition with the same key
try {
fileSet.addPartition(time2, "third/partition");
Assert.fail("Should have thrown Exception for duplicate partition");
} catch (DataSetException e) {
// expected
}
// remove first partition and validate
fileSet.dropPartition(time);
validateTimePartition(fileSet, time, null);
// verify listing partitions over various time ranges
validateTimePartitions(fileSet, 0L, MAX, expectSecond);
validateTimePartitions(fileSet, time, time + 30 * MINUTE, expectNone);
validateTimePartitions(fileSet, time + 30 * MINUTE, time2, expectNone);
validateTimePartitions(fileSet, time + 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
validateTimePartitions(fileSet, time - 30 * MINUTE, time2 + 30 * MINUTE, expectSecond);
// dropping the already-removed partition again should not throw
try {
fileSet.dropPartition(time);
} catch (DataSetException e) {
Assert.fail("Should not have have thrown Exception for removing non-existent partition");
}
}
});
}
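The validateTimePartition and validateTimePartitions helpers are likewise not shown. A plausible sketch, assuming they are built on the getPartitionByTime and getPartitionsByTime methods of the TimePartitionedFileSet API (only the call signatures come from the test above; the bodies are assumptions):

private void validateTimePartition(TimePartitionedFileSet fileSet, long time, String expectedPath) {
  // a null expected path means: no partition should exist for this time
  TimePartitionDetail detail = fileSet.getPartitionByTime(time);
  if (expectedPath == null) {
    Assert.assertNull(detail);
  } else {
    Assert.assertNotNull(detail);
    Assert.assertEquals(expectedPath, detail.getRelativePath());
  }
}

private void validateTimePartitions(TimePartitionedFileSet fileSet,
                                    long startTime, long endTime, Map<Long, String> expected) {
  // collect all partitions in [startTime, endTime) and compare time -> relative path
  Map<Long, String> actual = new HashMap<>();
  for (TimePartitionDetail detail : fileSet.getPartitionsByTime(startTime, endTime)) {
    actual.put(detail.getTime(), detail.getRelativePath());
  }
  Assert.assertEquals(expected, actual);
}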
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
In class TimePartitionedFileSetTest, method testOutputPartitionPath:
/**
* Tests that the output file path is set correctly, based on the output partition time.
*/
@Test
public void testOutputPartitionPath() throws Exception {
// test specifying output time
Date date = DATE_FORMAT.parse("1/1/15 8:42 pm");
Map<String, String> args = Maps.newHashMap();
TimePartitionedFileSetArguments.setOutputPartitionTime(args, date.getTime());
TimeZone timeZone = Calendar.getInstance().getTimeZone();
TimePartitionedFileSetArguments.setOutputPathFormat(args, "yyyy-MM-dd/HH_mm", timeZone.getID());
TimePartitionedFileSet ds = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
String outputPath = ds.getEmbeddedFileSet().getOutputLocation().toURI().getPath();
Assert.assertTrue(outputPath.endsWith("2015-01-01/20_42"));
Map<String, String> outputConfig = ds.getOutputFormatConfiguration();
Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("2015-01-01/20_42"));
// test specifying output time and partition key -> time should prevail
PartitionKey key = PartitionKey.builder()
  .addIntField("year", 2014)
  .addIntField("month", 1)
  .addIntField("day", 1)
  .addIntField("hour", 20)
  .addIntField("minute", 54)
  .build();
TimePartitionedFileSet ds1 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
TimePartitionedFileSetArguments.setOutputPartitionKey(args, key);
outputConfig = ds1.getOutputFormatConfiguration();
Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("2015-01-01/20_42"));
args.clear();
TimePartitionedFileSetArguments.setOutputPartitionKey(args, key);
TimePartitionedFileSet ds2 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
outputConfig = ds2.getOutputFormatConfiguration();
Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("54"));
args.clear();
TimePartitionedFileSet ds3 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args);
try {
ds3.getOutputFormatConfiguration();
Assert.fail("getOutputFormatConfiguration should have failed with neither output time nor partition key");
} catch (DataSetException e) {
// expected
}
}
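For reference, the asserted suffix "2015-01-01/20_42" is simply the output partition time rendered with the configured path format and time zone; a quick illustration of that expectation (this is an illustration only, not the dataset's internal code):

// Illustration only: derive the expected path suffix from the same pattern and time zone.
java.text.SimpleDateFormat pathFormat = new java.text.SimpleDateFormat("yyyy-MM-dd/HH_mm");
pathFormat.setTimeZone(timeZone);                // the zone passed to setOutputPathFormat above
String expectedSuffix = pathFormat.format(date); // "2015-01-01/20_42" for 1/1/15 8:42 pm
Assert.assertTrue(outputPath.endsWith(expectedSuffix));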
Use of io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
In class TimePartitionedFileSetTest, method testInputConfigurationFailure:
private void testInputConfigurationFailure(Map<String, String> arguments, final String why) throws Exception {
final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
TransactionAware txAwareDataset = (TransactionAware) dataset;
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
@Override
public void apply() throws Exception {
try {
dataset.getInputFormatConfiguration();
Assert.fail("getInputFormatConfiguration should fail " + why);
} catch (Exception e) {
// expected
}
}
});
}
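A caller of this helper passes an arguments map that is expected to be rejected when the input format configuration is computed. A hypothetical invocation (the argument combinations the real test exercises are not shown in this excerpt, so both the map contents and the failure reason below are assumptions):

// Hypothetical: only a start time is supplied, assuming an incomplete input time range is rejected.
Map<String, String> arguments = Maps.newHashMap();
TimePartitionedFileSetArguments.setInputStartTime(arguments, DATE_FORMAT.parse("12/10/14 1:00 am").getTime());
testInputConfigurationFailure(arguments, "when only an input start time is given");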