Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class MapReduceWithPartitionedTest, method testPartitionedFileSetWithMR.
@Test
public void testPartitionedFileSetWithMR() throws Exception {
  final ApplicationWithPrograms app = deployApp(AppWithPartitionedFileSet.class);
  // write a value to the input table
  final Table table = datasetCache.getDataset(AppWithPartitionedFileSet.INPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      table.put(Bytes.toBytes("x"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("1"));
    }
  });
  // a partition key for the map/reduce output
  final PartitionKey keyX = PartitionKey.builder().addStringField("type", "x").addLongField("time", 150000L).build();
  // run the partition writer m/r with this output partition key
  Map<String, String> runtimeArguments = Maps.newHashMap();
  Map<String, String> outputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyX);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
  // this should have created a partition in the PFS
  final PartitionedFileSet dataset = datasetCache.getDataset(PARTITIONED);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Partition partition = dataset.getPartition(keyX);
      Assert.assertNotNull(partition);
      String path = partition.getRelativePath();
      Assert.assertTrue(path.contains("x"));
      Assert.assertTrue(path.contains("150000"));
    }
  });
  // delete the data in the input table and write a new row
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) table).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      table.delete(Bytes.toBytes("x"));
      table.put(Bytes.toBytes("y"), AppWithPartitionedFileSet.ONLY_COLUMN, Bytes.toBytes("2"));
    }
  });
  // a new partition key for the next map/reduce
  final PartitionKey keyY = PartitionKey.builder().addStringField("type", "y").addLongField("time", 200000L).build();
  // now run the m/r again with the new partition key and a later time
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, keyY);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, outputArgs));
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionWriter.class, new BasicArguments(runtimeArguments)));
  // this should have created a second partition in the PFS
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Partition partition = dataset.getPartition(keyY);
      Assert.assertNotNull(partition);
      String path = partition.getRelativePath();
      Assert.assertNotNull(path);
      Assert.assertTrue(path.contains("y"));
      Assert.assertTrue(path.contains("200000"));
    }
  });
  // a partition filter that matches the outputs of both map/reduces
  PartitionFilter filterXY = PartitionFilter.builder().addRangeCondition("type", "x", "z").build();
  // now run a map/reduce that reads all the partitions
  runtimeArguments = Maps.newHashMap();
  Map<String, String> inputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterXY);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "a");
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read both partitions - and written both x and y to row a
  final Table output = datasetCache.getDataset(AppWithPartitionedFileSet.OUTPUT);
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Row row = output.get(Bytes.toBytes("a"));
      Assert.assertEquals("1", row.getString("x"));
      Assert.assertEquals("2", row.getString("y"));
    }
  });
  // a partition filter that matches only the output key of the first map/reduce
  PartitionFilter filterX = PartitionFilter.builder().addValueCondition("type", "x").addRangeCondition("time", null, 160000L).build();
  // now run a map/reduce that reads a range of the partitions, namely the first one
  inputArgs.clear();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterX);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "b");
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read the first partition only - and written only x to row b
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Row row = output.get(Bytes.toBytes("b"));
      Assert.assertEquals("1", row.getString("x"));
      Assert.assertNull(row.get("y"));
    }
  });
  // a partition filter that matches no key
  PartitionFilter filterMT = PartitionFilter.builder().addValueCondition("type", "nosuchthing").build();
  // now run a map/reduce with a filter that matches no partitions
  inputArgs.clear();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, filterMT);
  runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, PARTITIONED, inputArgs));
  runtimeArguments.put(AppWithPartitionedFileSet.ROW_TO_WRITE, "n");
  Assert.assertTrue(runProgram(app, AppWithPartitionedFileSet.PartitionReader.class, new BasicArguments(runtimeArguments)));
  // this should have read no partitions - and written nothing to row n
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) output).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() {
      Row row = output.get(Bytes.toBytes("n"));
      Assert.assertTrue(row.isEmpty());
    }
  });
}
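The wiring this test exercises is generic: build a PartitionKey, attach it to a map of output arguments, and scope those arguments to the target dataset before launching the program. A minimal sketch of just that pattern, assuming a partitioned file set named "results" (a hypothetical name, not from the test) partitioned by a string "type" and a long "time" as above:

// a minimal sketch, not the test's code: address one output partition of a
// hypothetical PFS named "results", partitioned by "type" (string) and "time" (long)
PartitionKey key = PartitionKey.builder()
  .addStringField("type", "x")
  .addLongField("time", System.currentTimeMillis())
  .build();
Map<String, String> outputArgs = new HashMap<>();
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, key);
// scope the arguments so that they apply only to the "results" dataset
Map<String, String> runtimeArgs = new HashMap<>(RuntimeArguments.addScope(Scope.DATASET, "results", outputArgs));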
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class HiveExploreServiceFileSetTestRun, method testPartitionedExisting.
private void testPartitionedExisting(String reuseProperty, boolean possessed) throws Exception {
  final DatasetId dummyInstanceId = NAMESPACE_ID.dataset("dummy");
  final DatasetId datasetInstanceId = NAMESPACE_ID.dataset("tpExisting");
  File path = new File(tmpFolder.newFolder(), "base");
  String tableName = "reuse";
  // create a dummy PFS in order to create a table in Hive and add a partition to it
  DatasetProperties props = PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addIntField("number").build())
    .setBasePath(path.toString())
    .setEnableExploreOnCreate(true)
    .setExploreTableName(tableName)
    .setExploreSchema("key STRING, value INT")
    .setExploreFormat("csv")
    .build();
  datasetFramework.addInstance(PartitionedFileSet.class.getName(), dummyInstanceId, props);
  PartitionedFileSet dummy = datasetFramework.getDataset(dummyInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(dummy);
  Location location = dummy.getEmbeddedFileSet().getLocation("number1").append("file1");
  PartitionKey key = PartitionKey.builder().addIntField("number", 1).build();
  FileWriterHelper.generateTextFile(location.getOutputStream(), ",", "x", 1, 2);
  addPartition(dummy, key, "number1");
  // validate the data
  List<ColumnDesc> expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".key", "STRING", 1, null),
    new ColumnDesc(tableName + ".value", "INT", 2, null),
    new ColumnDesc(tableName + ".number", "INT", 3, null));
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
  // create a second PFS over the same base path and Hive table, with the reuse property set
  props = PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addIntField("number").build())
    .setBasePath(path.toString())
    .setEnableExploreOnCreate(true)
    .setExploreTableName(tableName)
    .setExploreSchema("key STRING, value INT")
    .setExploreFormat("csv")
    .add(reuseProperty, "true")
    .build();
  datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetInstanceId, props);
  PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  // update the new dataset with a different explore schema
  props = PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addIntField("number").build())
    .setBasePath(path.toString())
    .setEnableExploreOnCreate(true)
    .setExploreTableName(tableName)
    .setExploreSchema("k STRING, v INT")
    .setExploreFormat("csv")
    .add(reuseProperty, "true")
    .build();
  datasetFramework.updateInstance(datasetInstanceId, props);
  // validate the data: only if the dataset owns ("possesses") the table did the update change its schema
  if (!possessed) {
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
  } else {
    List<ColumnDesc> newExpectedColumns = Lists.newArrayList(
      new ColumnDesc(tableName + ".k", "STRING", 1, null),
      new ColumnDesc(tableName + ".v", "INT", 2, null),
      new ColumnDesc(tableName + ".number", "INT", 3, null));
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, newExpectedColumns, null);
  }
  // deleting the dataset drops the Hive table only if the dataset owns it
  datasetFramework.deleteInstance(datasetInstanceId);
  if (!possessed) {
    runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName, true, expectedColumns, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
  } else {
    runCommand(NAMESPACE_ID, "SHOW tables", false, null, Collections.<QueryResult>emptyList());
  }
  datasetFramework.deleteInstance(dummyInstanceId);
}
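The interesting knob here is the extra property passed via .add(reuseProperty, "true"): it controls whether the second dataset merely reuses the existing Hive table or takes ownership of it ("possessed"), which in turn decides whether updateInstance may change the table's schema and whether deleteInstance drops the table. A condensed sketch of just those properties, where the base path is hypothetical and reuseProperty stands in for whichever property name the caller of the test supplies:

// a condensed sketch of the properties used above; the base path is hypothetical
// and reuseProperty stands in for the property name the test's caller passes
DatasetProperties props = PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addIntField("number").build())
  .setBasePath("/some/existing/base")          // hypothetical
  .setEnableExploreOnCreate(true)
  .setExploreTableName("reuse")                // the pre-existing Hive table
  .setExploreSchema("key STRING, value INT")
  .setExploreFormat("csv")
  .add(reuseProperty, "true")                  // reuse rather than create the table
  .build();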
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class HiveExploreServiceFileSetTestRun, method testPartitionedFileSet.
private void testPartitionedFileSet(@Nullable String dbName, @Nullable String tableName) throws Exception {
  DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parted");
  String hiveTableName = getDatasetHiveName(datasetInstanceId);
  String showTablesCommand = "show tables";
  FileSetProperties.Builder props = PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addStringField("str").addIntField("num").build())
    .setBasePath("parted")
    .setEnableExploreOnCreate(true)
    .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
    .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
    .setTableProperty("avro.schema.literal", SCHEMA.toString());
  if (tableName != null) {
    props.setExploreTableName(tableName);
    hiveTableName = tableName;
  }
  String queryTableName = hiveTableName;
  if (dbName != null) {
    props.setExploreDatabaseName(dbName);
    runCommand(NAMESPACE_ID, "create database " + dbName, false, null, null);
    showTablesCommand += " in " + dbName;
    queryTableName = dbName + "." + queryTableName;
  }
  // create a partitioned file set
  datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, props.build());
  // verify that the Hive table was created for this file set
  runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
  // access the dataset instance to perform data operations
  final PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  FileSet fileSet = partitioned.getEmbeddedFileSet();
  // add some partitions. Beware that Hive expects a partition to be a directory, so we create dirs with one file
  Location locationX1 = fileSet.getLocation("fileX1/nn");
  Location locationY1 = fileSet.getLocation("fileY1/nn");
  Location locationX2 = fileSet.getLocation("fileX2/nn");
  Location locationY2 = fileSet.getLocation("fileY2/nn");
  FileWriterHelper.generateAvroFile(locationX1.getOutputStream(), "x", 1, 2);
  FileWriterHelper.generateAvroFile(locationY1.getOutputStream(), "y", 1, 2);
  FileWriterHelper.generateAvroFile(locationX2.getOutputStream(), "x", 2, 3);
  FileWriterHelper.generateAvroFile(locationY2.getOutputStream(), "y", 2, 3);
  final PartitionKey keyX1 = PartitionKey.builder().addStringField("str", "x").addIntField("num", 1).build();
  PartitionKey keyY1 = PartitionKey.builder().addStringField("str", "y").addIntField("num", 1).build();
  final PartitionKey keyX2 = PartitionKey.builder().addStringField("str", "x").addIntField("num", 2).build();
  PartitionKey keyY2 = PartitionKey.builder().addStringField("str", "y").addIntField("num", 2).build();
  addPartition(partitioned, keyX1, "fileX1");
  addPartition(partitioned, keyY1, "fileY1");
  addPartition(partitioned, keyX2, "fileX2");
  addPartition(partitioned, keyY2, "fileY2");
  // verify that the partitions were added to Hive
  validatePartitions(queryTableName, partitioned, ImmutableList.of(keyX1, keyX2, keyY1, keyY2));
  // verify that count() and where... work in Hive
  runCommand(NAMESPACE_ID, "SELECT count(*) AS count FROM " + queryTableName, true,
             Lists.newArrayList(new ColumnDesc("count", "BIGINT", 1, null)),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(4L))));
  runCommand(NAMESPACE_ID, "SELECT * FROM " + queryTableName + " WHERE num = 2 ORDER BY key, value", true,
             Lists.newArrayList(
               new ColumnDesc(hiveTableName + ".key", "STRING", 1, null),
               new ColumnDesc(hiveTableName + ".value", "STRING", 2, null),
               new ColumnDesc(hiveTableName + ".str", "STRING", 3, null),
               new ColumnDesc(hiveTableName + ".num", "INT", 4, null)),
             Lists.newArrayList(
               new QueryResult(Lists.<Object>newArrayList("x2", "#2", "x", 2)),
               new QueryResult(Lists.<Object>newArrayList("y2", "#2", "y", 2))));
  // drop a partition and query again
  dropPartition(partitioned, keyX2);
  validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
  // attempt a transaction that drops one partition, adds another, and then fails
  try {
    doTransaction(partitioned, new Runnable() {
      @Override
      public void run() {
        partitioned.dropPartition(keyX1);
        partitioned.addPartition(keyX2, "fileX2");
        Assert.fail("fail tx");
      }
    });
  } catch (TransactionFailureException e) {
    // expected
  }
  // validate that both the drop and the addPartition were rolled back
  validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
  // attempt a transaction that adds an already existing partition and hence fails
  try {
    doTransaction(partitioned, new Runnable() {
      @Override
      public void run() {
        partitioned.addPartition(keyX1, "fileX1");
        throw new RuntimeException("on purpose");
      }
    });
  } catch (TransactionFailureException e) {
    // expected: the cause should be the DataSetException for adding an existing
    // partition, not the "on purpose" exception, which is never reached
    Assert.assertTrue(e.getCause() instanceof DataSetException);
  }
  // validate that the failed addPartition was rolled back
  validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
  // drop a partition directly from Hive
  runCommand(NAMESPACE_ID, "ALTER TABLE " + queryTableName + " DROP PARTITION (str='y', num=2)", false, null, null);
  // verify that one more partition, namely y2, is gone from Hive, but the PFS still has it
  validatePartitionsInHive(queryTableName, ImmutableSet.of(keyX1, keyY1));
  validatePartitionsInPFS(partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
  // make sure the partition can still be dropped from the PFS dataset
  dropPartition(partitioned, keyY2);
  validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1));
  // change the explore schema by updating the props
  datasetFramework.updateInstance(datasetInstanceId, props.setTableProperty("avro.schema.literal", K_SCHEMA.toString()).build());
  // validate the schema was updated
  validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1), true);
  // disable explore by updating the props
  datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(false).build());
  // verify the Hive table is gone
  runCommand(NAMESPACE_ID, showTablesCommand, false, null, Collections.<QueryResult>emptyList());
  // re-enable explore by updating the props
  datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(true).build());
  // verify the Hive table is back
  runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
  // drop the dataset
  datasetFramework.deleteInstance(datasetInstanceId);
  // verify the Hive table is gone
  runCommand(NAMESPACE_ID, "show tables", false, null, Collections.<QueryResult>emptyList());
}
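The doTransaction helper is not shown on this page. A plausible sketch, assuming it simply wraps the dataset and runnable in a TransactionExecutor the same way the first test above does:

// a plausible sketch of the doTransaction helper used above, assuming it wraps
// the runnable in a TransactionExecutor so that a thrown exception aborts the tx
private void doTransaction(PartitionedFileSet dataset, final Runnable runnable)
    throws TransactionFailureException, InterruptedException {
  Transactions.createTransactionExecutor(txExecutorFactory, (TransactionAware) dataset)
    .execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() {
        runnable.run();
      }
    });
}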
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class HiveExploreServiceFileSetTestRun, method testPartitionedTextFile.
// this tests mainly the support for different text formats. Other features (partitioning etc.) are tested above.
private void testPartitionedTextFile(String name, String format, String delim, String fileDelim) throws Exception {
  final DatasetId datasetInstanceId = NAMESPACE_ID.dataset(name);
  final String tableName = getDatasetHiveName(datasetInstanceId);
  // create a partitioned file set
  PartitionedFileSetProperties.Builder builder = (PartitionedFileSetProperties.Builder) PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addIntField("number").build())
    .setBasePath(name)
    .setEnableExploreOnCreate(true)
    .setExploreSchema("key STRING, value INT")
    .setExploreFormat(format);
  if (delim != null) {
    builder.setExploreFormatProperty("delimiter", delim);
  }
  datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, builder.build());
  // verify that the Hive table was created for this file set
  runCommand(NAMESPACE_ID, "show tables", true,
             Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
  // access the dataset instance to perform data operations
  PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  FileSet fileSet = partitioned.getEmbeddedFileSet();
  // add a partition. Beware that Hive expects a partition to be a directory, so we create a dir with one file
  Location location1 = fileSet.getLocation("file1/nn");
  FileWriterHelper.generateTextFile(location1.getOutputStream(), fileDelim, "x", 1, 2);
  PartitionKey key1 = PartitionKey.builder().addIntField("number", 1).build();
  addPartition(partitioned, key1, "file1");
  // verify that the partition was added to Hive
  runCommand(NAMESPACE_ID, "show partitions " + tableName, true,
             Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
  // verify that we can query the key-values in the file with Hive
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " ORDER BY key", true,
             Lists.newArrayList(
               new ColumnDesc(tableName + ".key", "STRING", 1, null),
               new ColumnDesc(tableName + ".value", "INT", 2, null),
               new ColumnDesc(tableName + ".number", "INT", 3, null)),
             Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
  // drop the partition
  dropPartition(partitioned, key1);
  // drop the dataset
  datasetFramework.deleteInstance(datasetInstanceId);
  // verify the Hive table is gone
  runCommand(NAMESPACE_ID, "show tables", false,
             Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")),
             Collections.<QueryResult>emptyList());
}
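The format-specific piece is the pair setExploreFormat(format) / setExploreFormatProperty("delimiter", delim). A minimal sketch for a delimited text table, assuming a "text" format that accepts a delimiter property, as the test's delim parameter suggests (the base path and delimiter are hypothetical):

// a minimal sketch, assuming the "text" format with a ':' field delimiter; for
// "csv" the delimiter is a comma and no delimiter property is passed
// (as the test's null delim implies)
PartitionedFileSetProperties.Builder builder = (PartitionedFileSetProperties.Builder)
  PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addIntField("number").build())
    .setBasePath("textDemo")                   // hypothetical base path
    .setEnableExploreOnCreate(true)
    .setExploreSchema("key STRING, value INT")
    .setExploreFormat("text")
    .setExploreFormatProperty("delimiter", ":");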
Use of co.cask.cdap.api.dataset.lib.PartitionKey in project cdap by caskdata.
The class DataCleansingMapReduce, method initialize.
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  partitionCommitter = PartitionBatchInput.setInput(context, DataCleansing.RAW_RECORDS, new KVTableStatePersistor(DataCleansing.CONSUMING_STATE, "state.key"));
  // each run writes its output to a partition keyed by a time value taken from the runtime arguments
  Long timeKey = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
  PartitionKey outputKey = PartitionKey.builder().addLongField("time", timeKey).build();
  Map<String, String> metadataToAssign = ImmutableMap.of("source.program", "DataCleansingMapReduce");
  // set up two outputs - one for invalid records and one for valid records
  Map<String, String> invalidRecordsArgs = new HashMap<>();
  PartitionedFileSetArguments.setOutputPartitionKey(invalidRecordsArgs, outputKey);
  PartitionedFileSetArguments.setOutputPartitionMetadata(invalidRecordsArgs, metadataToAssign);
  context.addOutput(Output.ofDataset(DataCleansing.INVALID_RECORDS, invalidRecordsArgs));
  Map<String, String> cleanRecordsArgs = new HashMap<>();
  // the clean records' partition keys are determined per record by a dynamic partitioner
  PartitionedFileSetArguments.setDynamicPartitioner(cleanRecordsArgs, TimeAndZipPartitioner.class);
  PartitionedFileSetArguments.setOutputPartitionMetadata(cleanRecordsArgs, metadataToAssign);
  context.addOutput(Output.ofDataset(DataCleansing.CLEAN_RECORDS, cleanRecordsArgs));
  Job job = context.getHadoopJob();
  job.setMapperClass(SchemaMatchingFilter.class);
  job.setNumReduceTasks(0);
  // simply propagate the schema (if any) to be used by the mapper
  String schemaJson = context.getRuntimeArguments().get(SCHEMA_KEY);
  if (schemaJson != null) {
    job.getConfiguration().set(SCHEMA_KEY, schemaJson);
  }
}
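TimeAndZipPartitioner itself is not shown on this page. A DynamicPartitioner derives a PartitionKey per record as the output is written; a minimal sketch of what such a class could look like, assuming CDAP's DynamicPartitioner API, a PFS partitioned on a long "time" and an int "zip", and a hypothetical record layout:

// a minimal sketch of what such a partitioner could look like; it is not the
// actual TimeAndZipPartitioner from CDAP. Assumes the class is nested in
// DataCleansingMapReduce (so OUTPUT_PARTITION_KEY is in scope) and that the
// zip code is the last comma-separated field of each record (hypothetical).
public static final class TimeAndZipPartitioner extends DynamicPartitioner<NullWritable, Text> {

  private Long time;

  @Override
  public void initialize(MapReduceTaskContext<NullWritable, Text> context) {
    // fix the time once per task, from the same runtime argument used in initialize() above
    this.time = Long.valueOf(context.getRuntimeArguments().get(OUTPUT_PARTITION_KEY));
  }

  @Override
  public PartitionKey getPartitionKey(NullWritable key, Text value) {
    // derive the zip code from the record; the parsing is a placeholder
    String[] fields = value.toString().split(",");
    int zip = Integer.parseInt(fields[fields.length - 1].trim());
    return PartitionKey.builder().addLongField("time", time).addIntField("zip", zip).build();
  }
}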