Search in sources :

Example 61 with StructuredRecord

use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class MockRuntimeDatasetSource method writeInput.

/**
 * Writes the input records for a pipeline run into the source dataset. Should be called after the
 * pipeline has been created.
 *
 * <p>Each record is stored in its own row, keyed by a freshly generated random UUID. The record's
 * schema (in its string form) is written to {@code SCHEMA_COL} and the record itself, rendered as
 * a JSON string, is written to {@code RECORD_COL}. The dataset manager is flushed once before the
 * table is obtained and once more after the last row has been written.
 *
 * @param tableManager dataset manager used to write to the source dataset
 * @param records records that should be the input for the pipeline
 * @throws Exception propagated from the dataset manager, the table, or record serialization
 */
public static void writeInput(DataSetManager<Table> tableManager, Iterable<StructuredRecord> records) throws Exception {
    tableManager.flush();
    Table sourceTable = tableManager.get();
    for (StructuredRecord entry : records) {
        // a new random UUID serves as the row key of every record
        byte[] rowKey = Bytes.toBytes(UUID.randomUUID());

        byte[] schemaBytes = Bytes.toBytes(entry.getSchema().toString());
        sourceTable.put(rowKey, SCHEMA_COL, schemaBytes);

        byte[] recordBytes = Bytes.toBytes(StructuredRecordStringConverter.toJsonString(entry));
        sourceTable.put(rowKey, RECORD_COL, recordBytes);
    }
    tableManager.flush();
}
Also used : Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord)

Example 62 with StructuredRecord

use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class MockExternalSink method readOutput.

/**
 * Reads back the records written by this sink.
 *
 * <p>Only files directly inside the given directory whose names start with {@code "part"} are
 * read. Every line of such a file is parsed with {@code GSON} into one {@link StructuredRecord}.
 * If the directory listing yields {@code null}, an empty list is returned.
 *
 * @param dirName directory where output files are found
 * @return the records parsed from all matching files, in the order the files were listed
 * @throws Exception propagated from reading or parsing the files
 */
public static List<StructuredRecord> readOutput(String dirName) throws Exception {
    FilenameFilter partFilesOnly = new FilenameFilter() {

        @Override
        public boolean accept(File parent, String fileName) {
            return fileName.startsWith("part");
        }
    };
    File[] partFiles = new File(dirName).listFiles(partFilesOnly);
    if (partFiles == null) {
        return Collections.emptyList();
    }
    List<StructuredRecord> parsed = new ArrayList<>();
    for (File partFile : partFiles) {
        // one JSON-serialized record per line
        for (String line : Files.readLines(partFile, Charsets.UTF_8)) {
            parsed.add(GSON.fromJson(line, StructuredRecord.class));
        }
    }
    return parsed;
}
Also used : FilenameFilter(java.io.FilenameFilter) Function(com.google.common.base.Function) ArrayList(java.util.ArrayList) File(java.io.File) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord)

Example 63 with StructuredRecord

use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class FilterTransform method transform.

/**
 * Emits a copy of the input record only when the remote service echoes back the record's name.
 *
 * <p>The name is read from the input field identified by {@code config.name}. A connection is
 * opened to {@code name/<name>} resolved against {@code url}, the response body is decoded as
 * UTF-8 and compared to the name ignoring case. The connection is always disconnected, whether
 * or not a record was emitted.
 *
 * @param input record to filter
 * @param emitter receives the copied record when the name matches the service response
 * @throws Exception propagated from building the copy, the HTTP call, or the emitter
 */
@Override
public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
    StructuredRecord.Builder copyBuilder = getOutputBuilder(input);
    String recordName = input.get(config.name);
    HttpURLConnection http = (HttpURLConnection) new URL(url, "name/" + recordName).openConnection();
    try {
        byte[] body = ByteStreams.toByteArray(http.getInputStream());
        String reply = new String(body, Charsets.UTF_8);
        // Records whose name is not stored in the service are dropped
        if (!reply.equalsIgnoreCase(recordName)) {
            return;
        }
        emitter.emit(copyBuilder.build());
    } finally {
        http.disconnect();
    }
}
Also used : HttpURLConnection(java.net.HttpURLConnection) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) URL(java.net.URL)

Example 64 with StructuredRecord

use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class FilterTransform method getOutputBuilder.

/**
 * Creates a record builder whose schema mirrors the input record's schema and which already
 * holds a copy of every field value of the input.
 *
 * <p>The output schema is a new record schema with the same record name and the same fields as
 * the schema of {@code input}.
 *
 * @param input record whose schema and values are copied
 * @return a builder pre-populated with all of the input's field values
 */
private StructuredRecord.Builder getOutputBuilder(StructuredRecord input) {
    Schema inputSchema = input.getSchema();
    List<Schema.Field> outFields = new ArrayList<>();
    for (Schema.Field field : inputSchema.getFields()) {
        outFields.add(field);
    }
    Schema outSchema = Schema.recordOf(inputSchema.getRecordName(), outFields);
    // copy all the values; the field list is deliberately left untouched from here on, since the
    // output schema has already been built from it
    StructuredRecord.Builder outputBuilder = StructuredRecord.builder(outSchema);
    for (Schema.Field inField : inputSchema.getFields()) {
        outputBuilder.set(inField.getName(), input.get(inField.getName()));
    }
    return outputBuilder;
}
Also used : PluginPropertyField(co.cask.cdap.api.plugin.PluginPropertyField) Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord)

Example 65 with StructuredRecord

use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class PipelineTest method testWordCountSparkSink.

/**
 * Deploys a two-stage batch pipeline (mock source feeding the word-count Spark sink), writes
 * three sentences to the source dataset, runs the workflow and verifies the per-word counts
 * stored in the output key-value table.
 */
@SuppressWarnings("ConstantConditions")
@Test
public void testWordCountSparkSink() throws Exception {
    String inputName = "sparkSinkInput";
    String outputName = "sparkSinkOutput";

    // pipeline config: mock source -> word count sink
    ETLStage source = new ETLStage("source", MockSource.getPlugin(inputName));
    Map<String, String> sinkProperties = new HashMap<>();
    sinkProperties.put("field", "text");
    sinkProperties.put("tableName", outputName);
    ETLPlugin sinkPlugin = new ETLPlugin(WordCountSink.NAME, SparkSink.PLUGIN_TYPE, sinkProperties, null);
    ETLStage sink = new ETLStage("sink", sinkPlugin);
    ETLBatchConfig pipelineConfig = ETLBatchConfig.builder("* * * * *")
        .addStage(source)
        .addStage(sink)
        .addConnection(source.getName(), sink.getName())
        .build();

    // deploy the pipeline
    ApplicationId pipelineId = NamespaceId.DEFAULT.app("sparkSinkTestPipeline");
    ApplicationManager appManager = deployApplication(pipelineId, new AppRequest<>(APP_ARTIFACT, pipelineConfig));

    // write one single-field record per sentence to the source dataset
    Schema inputSchema = Schema.recordOf("text", Schema.Field.of("text", Schema.of(Schema.Type.STRING)));
    DataSetManager<Table> inputManager = getDataset(inputName);
    String[] sentences = {"Hello World", "Hello my name is Hal", "Hello my name is Sam"};
    List<StructuredRecord> inputRecords = new ArrayList<>();
    for (String sentence : sentences) {
        inputRecords.add(StructuredRecord.builder(inputSchema).set("text", sentence).build());
    }
    MockSource.writeInput(inputManager, inputRecords);

    // run the workflow, waiting at most four minutes for it to finish
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.start();
    workflowManager.waitForFinish(4, TimeUnit.MINUTES);

    // check the count stored for every word
    DataSetManager<KeyValueTable> outputManager = getDataset(outputName);
    KeyValueTable output = outputManager.get();
    String[] words = {"Hello", "World", "my", "name", "is", "Hal", "Sam"};
    long[] expectedCounts = {3L, 1L, 2L, 2L, 2L, 1L, 1L};
    for (int i = 0; i < words.length; i++) {
        Assert.assertEquals(expectedCounts[i], Bytes.toLong(output.read(words[i])));
    }
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Table(co.cask.cdap.api.dataset.table.Table) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) HashMap(java.util.HashMap) Schema(co.cask.cdap.api.data.schema.Schema) WorkflowManager(co.cask.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ETLPlugin(co.cask.cdap.etl.proto.v2.ETLPlugin) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) ApplicationId(co.cask.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Aggregations

StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 97; Schema (co.cask.cdap.api.data.schema.Schema): 71; Test (org.junit.Test): 51; Table (co.cask.cdap.api.dataset.table.Table): 36; ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 36; ApplicationId (co.cask.cdap.proto.id.ApplicationId): 36; ApplicationManager (co.cask.cdap.test.ApplicationManager): 33; AppRequest (co.cask.cdap.proto.artifact.AppRequest): 31; KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 25; ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 25; WorkflowManager (co.cask.cdap.test.WorkflowManager): 23; ArrayList (java.util.ArrayList): 20; StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent): 19; FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 18; HashSet (java.util.HashSet): 10; DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig): 8; File (java.io.File): 8; TimeoutException (java.util.concurrent.TimeoutException): 8; Put (co.cask.cdap.api.dataset.table.Put): 7; ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 7