use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedTextFile.
// this tests mainly the support for different text formats. Other features (partitioning etc.) are tested above.
private void testPartitionedTextFile(String name, String format, String delim, String fileDelim) throws Exception {
final DatasetId datasetInstanceId = NAMESPACE_ID.dataset(name);
final String tableName = getDatasetHiveName(datasetInstanceId);
// create a time partitioned file set
PartitionedFileSetProperties.Builder builder = (PartitionedFileSetProperties.Builder) PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setBasePath(name).setEnableExploreOnCreate(true).setExploreSchema("key STRING, value INT").setExploreFormat(format);
if (delim != null) {
builder.setExploreFormatProperty("delimiter", delim);
}
datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, builder.build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, "show tables", true, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(tableName))));
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partitions. Beware that Hive expects a partition to be a directory, so we create a dir with one file
Location location1 = fileSet.getLocation("file1/nn");
FileWriterHelper.generateTextFile(location1.getOutputStream(), fileDelim, "x", 1, 2);
PartitionKey key1 = PartitionKey.builder().addIntField("number", 1).build();
addPartition(partitioned, key1, "file1");
// verify that the partitions were added to Hive
runCommand(NAMESPACE_ID, "show partitions " + tableName, true, Lists.newArrayList(new ColumnDesc("partition", "STRING", 1, "from deserializer")), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("number=1"))));
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " ORDER BY key", true, Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "INT", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x1", 1, 1))));
// drop a partition and query again
dropPartition(partitioned, key1);
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, Lists.newArrayList(new ColumnDesc("tab_name", "STRING", 1, "from deserializer")), Collections.<QueryResult>emptyList());
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testCreateAddAlterDrop.
private void testCreateAddAlterDrop(@Nullable String dbName, @Nullable String tableName) throws Exception {
DatasetId datasetInstanceId = NAMESPACE_ID.dataset("files");
String hiveTableName = getDatasetHiveName(datasetInstanceId);
String showTablesCommand = "show tables";
FileSetProperties.Builder props = FileSetProperties.builder().setBasePath("myPath").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString());
if (tableName != null) {
props.setExploreTableName(tableName);
hiveTableName = tableName;
}
String queryTableName = hiveTableName;
if (dbName != null) {
props.setExploreDatabaseName(dbName);
runCommand(NAMESPACE_ID, "create database " + dbName, false, null, null);
showTablesCommand += " in " + dbName;
queryTableName = dbName + "." + queryTableName;
}
// create a time partitioned file set
datasetFramework.addInstance("fileSet", datasetInstanceId, props.build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
// Accessing dataset instance to perform data operations
FileSet fileSet = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(fileSet);
// add a file
FileWriterHelper.generateAvroFile(fileSet.getLocation("file1").getOutputStream(), "a", 0, 3);
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT * FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc(hiveTableName + ".key", "STRING", 1, null), new ColumnDesc(hiveTableName + ".value", "STRING", 2, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("a0", "#0")), new QueryResult(Lists.<Object>newArrayList("a1", "#1")), new QueryResult(Lists.<Object>newArrayList("a2", "#2"))));
// add another file
FileWriterHelper.generateAvroFile(fileSet.getLocation("file2").getOutputStream(), "b", 3, 5);
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT count(*) AS count FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc("count", "BIGINT", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(5L))));
// disable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(false).build());
// verify the Hive table is gone
runCommand(NAMESPACE_ID, showTablesCommand, false, null, Collections.<QueryResult>emptyList());
// re-enable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(true).build());
// verify that we can query again
runCommand(NAMESPACE_ID, "SELECT count(*) AS count FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc("count", "BIGINT", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(5L))));
// change the explore schema by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setTableProperty("avro.schema.literal", K_SCHEMA.toString()).build());
// verify that we can query the key-values in the file with Hive
runCommand(NAMESPACE_ID, "SELECT * FROM " + queryTableName + " ORDER BY key", true, Lists.newArrayList(new ColumnDesc(hiveTableName + ".key", "STRING", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("a0")), new QueryResult(Lists.<Object>newArrayList("a1")), new QueryResult(Lists.<Object>newArrayList("a2")), new QueryResult(Lists.<Object>newArrayList("b3")), new QueryResult(Lists.<Object>newArrayList("b4"))));
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, showTablesCommand, false, null, Collections.<QueryResult>emptyList());
// drop the database if needed
if (dbName != null) {
runCommand(NAMESPACE_ID, "drop database " + dbName, false, null, null);
}
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedFileSet.
private void testPartitionedFileSet(@Nullable String dbName, @Nullable String tableName) throws Exception {
DatasetId datasetInstanceId = NAMESPACE_ID.dataset("parted");
String hiveTableName = getDatasetHiveName(datasetInstanceId);
String showTablesCommand = "show tables";
FileSetProperties.Builder props = PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addStringField("str").addIntField("num").build()).setBasePath("parted").setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString());
if (tableName != null) {
props.setExploreTableName(tableName);
hiveTableName = tableName;
}
String queryTableName = hiveTableName;
if (dbName != null) {
props.setExploreDatabaseName(dbName);
runCommand(NAMESPACE_ID, "create database " + dbName, false, null, null);
showTablesCommand += " in " + dbName;
queryTableName = dbName + "." + queryTableName;
}
// create a time partitioned file set
datasetFramework.addInstance("partitionedFileSet", datasetInstanceId, props.build());
// verify that the hive table was created for this file set
runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
// Accessing dataset instance to perform data operations
final PartitionedFileSet partitioned = datasetFramework.getDataset(datasetInstanceId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add some partitions. Beware that Hive expects a partition to be a directory, so we create dirs with one file
Location locationX1 = fileSet.getLocation("fileX1/nn");
Location locationY1 = fileSet.getLocation("fileY1/nn");
Location locationX2 = fileSet.getLocation("fileX2/nn");
Location locationY2 = fileSet.getLocation("fileY2/nn");
FileWriterHelper.generateAvroFile(locationX1.getOutputStream(), "x", 1, 2);
FileWriterHelper.generateAvroFile(locationY1.getOutputStream(), "y", 1, 2);
FileWriterHelper.generateAvroFile(locationX2.getOutputStream(), "x", 2, 3);
FileWriterHelper.generateAvroFile(locationY2.getOutputStream(), "y", 2, 3);
final PartitionKey keyX1 = PartitionKey.builder().addStringField("str", "x").addIntField("num", 1).build();
PartitionKey keyY1 = PartitionKey.builder().addStringField("str", "y").addIntField("num", 1).build();
final PartitionKey keyX2 = PartitionKey.builder().addStringField("str", "x").addIntField("num", 2).build();
PartitionKey keyY2 = PartitionKey.builder().addStringField("str", "y").addIntField("num", 2).build();
addPartition(partitioned, keyX1, "fileX1");
addPartition(partitioned, keyY1, "fileY1");
addPartition(partitioned, keyX2, "fileX2");
addPartition(partitioned, keyY2, "fileY2");
// verify that the partitions were added to Hive
validatePartitions(queryTableName, partitioned, ImmutableList.of(keyX1, keyX2, keyY1, keyY2));
// verify that count() and where... work in Hive
runCommand(NAMESPACE_ID, "SELECT count(*) AS count FROM " + queryTableName, true, Lists.newArrayList(new ColumnDesc("count", "BIGINT", 1, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(4L))));
runCommand(NAMESPACE_ID, "SELECT * FROM " + queryTableName + " WHERE num = 2 ORDER BY key, value", true, Lists.newArrayList(new ColumnDesc(hiveTableName + ".key", "STRING", 1, null), new ColumnDesc(hiveTableName + ".value", "STRING", 2, null), new ColumnDesc(hiveTableName + ".str", "STRING", 3, null), new ColumnDesc(hiveTableName + ".num", "INT", 4, null)), Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x2", "#2", "x", 2)), new QueryResult(Lists.<Object>newArrayList("y2", "#2", "y", 2))));
// drop a partition and query again
dropPartition(partitioned, keyX2);
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// attempt a transaction that drops one partition, adds another, and then fails
try {
doTransaction(partitioned, new Runnable() {
@Override
public void run() {
partitioned.dropPartition(keyX1);
partitioned.addPartition(keyX2, "fileX2");
Assert.fail("fail tx");
}
});
} catch (TransactionFailureException e) {
// expected
}
// validate that both the drop and addPartition were undone
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// attempt a transaction that attempts to add an existing partition, hence fails
try {
doTransaction(partitioned, new Runnable() {
@Override
public void run() {
partitioned.addPartition(keyX1, "fileX1");
throw new RuntimeException("on purpose");
}
});
} catch (TransactionFailureException e) {
// expected if the cause is not "on purpose"
Assert.assertTrue(e.getCause() instanceof DataSetException);
}
// validate that both the drop and addPartition were undone
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// drop a partition directly from hive
runCommand(NAMESPACE_ID, "ALTER TABLE " + queryTableName + " DROP PARTITION (str='y', num=2)", false, null, null);
// verify that one more value is gone now, namely y2, in Hive, but the PFS still has it
validatePartitionsInHive(queryTableName, ImmutableSet.of(keyX1, keyY1));
validatePartitionsInPFS(partitioned, ImmutableSet.of(keyX1, keyY1, keyY2));
// make sure the partition can still be dropped from the PFS dataset
dropPartition(partitioned, keyY2);
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1));
// change the explore schema by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setTableProperty("avro.schema.literal", K_SCHEMA.toString()).build());
// valudate the schema was updated
validatePartitions(queryTableName, partitioned, ImmutableSet.of(keyX1, keyY1), true);
// disable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(false).build());
// verify the Hive table is gone
runCommand(NAMESPACE_ID, showTablesCommand, false, null, Collections.<QueryResult>emptyList());
// re-enable explore by updating the props
datasetFramework.updateInstance(datasetInstanceId, props.setEnableExploreOnCreate(true).build());
// verify the Hive table is back
runCommand(NAMESPACE_ID, showTablesCommand, true, null, Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList(hiveTableName))));
// drop the dataset
datasetFramework.deleteInstance(datasetInstanceId);
// verify the Hive table is gone
runCommand(NAMESPACE_ID, "show tables", false, null, Collections.<QueryResult>emptyList());
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class HiveExploreServiceFileSetTestRun method testPartitionedAvroSchemaUpdate.
@Test
public void testPartitionedAvroSchemaUpdate() throws Exception {
final DatasetId datasetId = NAMESPACE_ID.dataset("avroupd");
final String tableName = getDatasetHiveName(datasetId);
// create a time partitioned file set
datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId, PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", SCHEMA.toString()).build());
// Accessing dataset instance to perform data operations
PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
Assert.assertNotNull(partitioned);
FileSet fileSet = partitioned.getEmbeddedFileSet();
// add a partition
Location location4 = fileSet.getLocation("file4/nn");
FileWriterHelper.generateAvroFile(location4.getOutputStream(), "x", 4, 5);
addPartition(partitioned, PartitionKey.builder().addIntField("number", 4).build(), "file4");
// new partition should have new format, validate with query
List<ColumnDesc> expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".value", "STRING", 2, null), new ColumnDesc(tableName + ".number", "INT", 3, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns, Lists.newArrayList(// avro file has key=x4, value=#4
new QueryResult(Lists.<Object>newArrayList("x4", "#4", 4))));
// update the partitioned file set
datasetFramework.updateInstance(datasetId, PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addIntField("number").build()).setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe").setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat").setTableProperty("avro.schema.literal", K_SCHEMA.toString()).build());
expectedColumns = Lists.newArrayList(new ColumnDesc(tableName + ".key", "STRING", 1, null), new ColumnDesc(tableName + ".number", "INT", 2, null));
runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns, Lists.newArrayList(// avro file has key=x4, value=#4
new QueryResult(Lists.<Object>newArrayList("x4", 4))));
}
use of io.cdap.cdap.api.dataset.lib.FileSet in project cdap by caskdata.
the class DataPipelineTest method testSinglePhaseWithSparkSink.
private void testSinglePhaseWithSparkSink() throws Exception {
/*
* source1 ---|
* |--> sparksink
* source2 ---|
*/
ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(new ETLStage("source1", MockSource.getPlugin("messages1", SpamMessage.SCHEMA))).addStage(new ETLStage("source2", MockSource.getPlugin("messages2", SpamMessage.SCHEMA))).addStage(new ETLStage("customsink", new ETLPlugin(NaiveBayesTrainer.PLUGIN_NAME, SparkSink.PLUGIN_TYPE, ImmutableMap.of("fileSetName", "modelFileSet", "path", "output", "fieldToClassify", SpamMessage.TEXT_FIELD, "predictionField", SpamMessage.SPAM_PREDICTION_FIELD), null))).addConnection("source1", "customsink").addConnection("source2", "customsink").build();
AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
ApplicationId appId = NamespaceId.DEFAULT.app("SparkSinkApp");
ApplicationManager appManager = deployApplication(appId, appRequest);
// set up five spam messages and five non-spam messages to be used for classification
List<StructuredRecord> messagesToWrite = new ArrayList<>();
messagesToWrite.add(new SpamMessage("buy our clothes", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("sell your used books to us", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("earn money for free", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is definitely not spam", 1.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("you won the lottery", 1.0).toStructuredRecord());
// write records to source1
DataSetManager<Table> inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages1"));
MockSource.writeInput(inputManager, messagesToWrite);
messagesToWrite.clear();
messagesToWrite.add(new SpamMessage("how was your day", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("what are you up to", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is a genuine message", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("this is an even more genuine message", 0.0).toStructuredRecord());
messagesToWrite.add(new SpamMessage("could you send me the report", 0.0).toStructuredRecord());
// write records to source2
inputManager = getDataset(NamespaceId.DEFAULT.dataset("messages2"));
MockSource.writeInput(inputManager, messagesToWrite);
// ingest in some messages to be classified
DataSetManager<FileSet> fileSetManager = getDataset(NaiveBayesTrainer.TEXTS_TO_CLASSIFY);
FileSet fileSet = fileSetManager.get();
try (PrintStream out = new PrintStream(fileSet.getLocation("inputTexts").getOutputStream(), true, "UTF-8")) {
out.println("how are you doing today");
out.println("free money money");
out.println("what are you doing today");
out.println("genuine report");
}
// manually trigger the pipeline
Map<String, String> runtimeArgs = new HashMap<>();
FileSetArguments.setInputPath(runtimeArgs, "inputTexts");
WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
workflowManager.start(runtimeArgs);
workflowManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
DataSetManager<KeyValueTable> classifiedTexts = getDataset(NaiveBayesTrainer.CLASSIFIED_TEXTS);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("how are you doing today")), 0.01d);
// only 'free money money' should be predicated as spam
Assert.assertEquals(1.0d, Bytes.toDouble(classifiedTexts.get().read("free money money")), 0.01d);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("what are you doing today")), 0.01d);
Assert.assertEquals(0.0d, Bytes.toDouble(classifiedTexts.get().read("genuine report")), 0.01d);
validateMetric(5, appId, "source1.records.out");
validateMetric(5, appId, "source2.records.out");
validateMetric(10, appId, "customsink.records.in");
}
Aggregations