Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class MockRuntimeDatasetSource, method writeInput.
/**
* Used to write the input records for the pipeline run. Should be called after the pipeline has been created.
*
* @param tableManager dataset manager used to write to the source dataset
* @param records records that should be the input for the pipeline
*/
public static void writeInput(DataSetManager<Table> tableManager, Iterable<StructuredRecord> records) throws Exception {
  tableManager.flush();
  Table table = tableManager.get();
  // each row key is a random UUID; the record schema and the JSON-serialized record go into separate columns
  for (StructuredRecord record : records) {
    byte[] row = Bytes.toBytes(UUID.randomUUID());
    table.put(row, SCHEMA_COL, Bytes.toBytes(record.getSchema().toString()));
    table.put(row, RECORD_COL, Bytes.toBytes(StructuredRecordStringConverter.toJsonString(record)));
  }
  tableManager.flush();
}
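A minimal usage sketch: it assumes a test class that exposes getDataset() (as the PipelineTest further down does) and an input dataset named "inputTable"; the schema, field, and values are illustrative, not taken from the snippet above.

// Hypothetical usage: the dataset name, schema, and field values are assumptions for illustration.
Schema schema = Schema.recordOf("person", Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
List<StructuredRecord> records = new ArrayList<>();
records.add(StructuredRecord.builder(schema).set("name", "alice").build());
records.add(StructuredRecord.builder(schema).set("name", "bob").build());
DataSetManager<Table> inputManager = getDataset("inputTable"); // provided by the surrounding test base class
MockRuntimeDatasetSource.writeInput(inputManager, records);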
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class MockExternalSink, method readOutput.
/**
* Used to read the records written by this sink.
*
* @param dirName directory where output files are found
*/
public static List<StructuredRecord> readOutput(String dirName) throws Exception {
  File dir = new File(dirName);
  // output files follow the Hadoop naming convention and start with "part"
  File[] files = dir.listFiles(new FilenameFilter() {
    @Override
    public boolean accept(File dir, String name) {
      return name.startsWith("part");
    }
  });
  if (files == null) {
    return Collections.emptyList();
  }
  List<StructuredRecord> records = new ArrayList<>();
  for (File file : files) {
    // each line of each output file is one JSON-serialized StructuredRecord
    records.addAll(Lists.transform(Files.readLines(file, Charsets.UTF_8), new Function<String, StructuredRecord>() {
      @Override
      public StructuredRecord apply(String input) {
        return GSON.fromJson(input, StructuredRecord.class);
      }
    }));
  }
  return records;
}
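A hedged usage sketch: the output directory path and the field being read are assumptions for illustration, since the snippet does not show where the sink writes or what schema it uses.

// Hypothetical usage: directory and field name are illustrative assumptions.
List<StructuredRecord> written = MockExternalSink.readOutput("/tmp/pipeline-output");
for (StructuredRecord record : written) {
  String name = record.get("name");
  System.out.println(name);
}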
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class FilterTransform, method transform.
@Override
public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
  StructuredRecord.Builder outputBuilder = getOutputBuilder(input);
  String name = input.get(config.name);
  URL nameUrl = new URL(url, "name/" + name);
  HttpURLConnection connection = (HttpURLConnection) nameUrl.openConnection();
  String response;
  try {
    response = new String(ByteStreams.toByteArray(connection.getInputStream()), Charsets.UTF_8);
    // only emit records whose name is stored in the service
    if (response.equalsIgnoreCase(name)) {
      emitter.emit(outputBuilder.build());
    }
  } finally {
    connection.disconnect();
  }
}
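The emit decision reduces to one HTTP round trip against the lookup service. Below is a minimal standalone sketch of that check, assuming Guava's ByteStreams and Charsets are on the classpath; the method name and the "name/" path segment mirror the snippet above but are otherwise illustrative.

// Sketch only: returns true when the service echoes the queried name back (case-insensitively).
static boolean nameIsRegistered(URL baseUrl, String name) throws IOException {
  HttpURLConnection connection = (HttpURLConnection) new URL(baseUrl, "name/" + name).openConnection();
  try {
    String response = new String(ByteStreams.toByteArray(connection.getInputStream()), Charsets.UTF_8);
    return response.equalsIgnoreCase(name);
  } finally {
    connection.disconnect();
  }
}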
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class FilterTransform, method getOutputBuilder.
private StructuredRecord.Builder getOutputBuilder(StructuredRecord input) {
  // the output schema is the input schema, field for field
  List<Schema.Field> outFields = new ArrayList<>();
  for (Schema.Field field : input.getSchema().getFields()) {
    outFields.add(field);
  }
  Schema outSchema = Schema.recordOf(input.getSchema().getRecordName(), outFields);
  // copy all the values
  StructuredRecord.Builder outputBuilder = StructuredRecord.builder(outSchema);
  for (Schema.Field inField : input.getSchema().getFields()) {
    outputBuilder.set(inField.getName(), input.get(inField.getName()));
  }
  return outputBuilder;
}
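Since the output schema here is simply the input schema, the same copy can be written more compactly; the sketch below is an equivalent rewrite under that assumption, not the project's code.

// Equivalent sketch, assuming the output schema is exactly the input schema.
private StructuredRecord.Builder copyRecord(StructuredRecord input) {
  StructuredRecord.Builder builder = StructuredRecord.builder(input.getSchema());
  for (Schema.Field field : input.getSchema().getFields()) {
    builder.set(field.getName(), input.get(field.getName()));
  }
  return builder;
}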
Use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
The class PipelineTest, method testWordCountSparkSink.
@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
  String inputName = "sparkSinkInput";
  String outputName = "sparkSinkOutput";
  // create the pipeline config
  ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
  Map<String, String> sinkProperties = new HashMap<>();
  sinkProperties.put("field", "text");
  sinkProperties.put("tableName", outputName);
  ETLStage sink = new ETLStage("sink", new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null));
  ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
    .addStage(source)
    .addStage(sink)
    .addConnection(source.getName(), sink.getName())
    .build();
  // create the pipeline
  ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
  ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));
  // write the input
  Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
  DataSetManager<Table> inputManager = getDataset(inputName);
  List<StructuredRecord> inputRecords = new ArrayList<>();
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello World").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Hal").build());
  inputRecords.add(StructuredRecord.builder(inputSchema).set("text", "Hello my name is Sam").build());
  MockSource.writeInput(inputManager, inputRecords);
  // run the pipeline's workflow and wait for it to finish
  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.start();
  workflowManager.waitForFinish(4, TimeUnit.MINUTES);
  // verify the word counts written by the spark sink
  DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
  KeyValueTable output = outputManager.get();
  Assert.assertEquals(3L, Bytes.toLong(output.read("Hello")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("World")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("my")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("name")));
  Assert.assertEquals(2L, Bytes.toLong(output.read("is")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("Hal")));
  Assert.assertEquals(1L, Bytes.toLong(output.read("Sam")));
}
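For reference, the asserted counts follow directly from the three input lines; the plain-Java tally below is illustrative only and is not how WordCountSink is implemented.

// Illustrative only: recomputing the counts the assertions above expect.
Map<String, Long> counts = new HashMap<>();
for (String line : Arrays.asList("Hello World", "Hello my name is Hal", "Hello my name is Sam")) {
  for (String word : line.split(" ")) {
    counts.merge(word, 1L, Long::sum);
  }
}
// counts == {Hello=3, World=1, my=2, name=2, is=2, Hal=1, Sam=1}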