Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In class DataStreamsTest, the method testAlertPublisher:
@Test
public void testAlertPublisher() throws Exception {
  String sinkName = "alertSink";
  String topic = "alertTopic";
  Schema schema = Schema.recordOf("x", Schema.Field.of("id", Schema.nullableOf(Schema.of(Schema.Type.LONG))));
  StructuredRecord record1 = StructuredRecord.builder(schema).set("id", 1L).build();
  StructuredRecord record2 = StructuredRecord.builder(schema).set("id", 2L).build();
  StructuredRecord alertRecord = StructuredRecord.builder(schema).build();

  /*
   * source --> nullAlert --> sink
   *                |
   *                |--> TMS publisher
   */
  DataStreamsConfig config = DataStreamsConfig.builder()
    .setBatchInterval("5s")
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, ImmutableList.of(record1, record2, alertRecord))))
    .addStage(new ETLStage("nullAlert", NullAlertTransform.getPlugin("id")))
    .addStage(new ETLStage("sink", MockSink.getPlugin(sinkName)))
    .addStage(new ETLStage("tms", TMSAlertPublisher.getPlugin(topic, NamespaceId.DEFAULT.getNamespace())))
    .addConnection("source", "nullAlert")
    .addConnection("nullAlert", "sink")
    .addConnection("nullAlert", "tms")
    .setCheckpointDir(checkpointDir)
    .build();

  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, config);
  ApplicationId appId = NamespaceId.DEFAULT.app("AlertTest");
  ApplicationManager appManager = deployApplication(appId, appRequest);

  SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
  sparkManager.start();
  sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);

  final Set<StructuredRecord> expectedRecords = ImmutableSet.of(record1, record2);
  final Set<Alert> expectedMessages = ImmutableSet.of(new Alert("nullAlert", new HashMap<String, String>()));
  final DataSetManager<Table> sinkTable = getDataset(sinkName);

  Tasks.waitFor(true, () -> {
    // get alerts from TMS
    try {
      getMessagingAdmin(NamespaceId.DEFAULT.getNamespace()).getTopicProperties(topic);
    } catch (TopicNotFoundException e) {
      return false;
    }
    MessageFetcher messageFetcher = getMessagingContext().getMessageFetcher();
    Set<Alert> actualMessages = new HashSet<>();
    try (CloseableIterator<Message> iter =
           messageFetcher.fetch(NamespaceId.DEFAULT.getNamespace(), topic, 5, 0)) {
      while (iter.hasNext()) {
        Message message = iter.next();
        Alert alert = message.decodePayload(r -> GSON.fromJson(r, Alert.class));
        actualMessages.add(alert);
      }
    }

    // get records from sink
    sinkTable.flush();
    Set<StructuredRecord> outputRecords = new HashSet<>(MockSink.readOutput(sinkTable));
    return expectedRecords.equals(outputRecords) && expectedMessages.equals(actualMessages);
  }, 4, TimeUnit.MINUTES);

  sparkManager.stop();
  sparkManager.waitForStopped(10, TimeUnit.SECONDS);

  validateMetric(appId, "source.records.out", 3);
  validateMetric(appId, "nullAlert.records.in", 3);
  validateMetric(appId, "nullAlert.records.out", 2);
  validateMetric(appId, "nullAlert.records.alert", 1);
  validateMetric(appId, "sink.records.in", 2);
  validateMetric(appId, "tms.records.in", 1);
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In class MockRuntimeDatasetSink, the method readOutput:
/**
* Used to read the records written by this sink.
*
* @param tableManager dataset manager used to get the sink dataset to read from
*/
public static List<StructuredRecord> readOutput(DataSetManager<Table> tableManager) throws Exception {
  Table table = tableManager.get();
  try (Scanner scanner = table.scan(null, null)) {
    List<StructuredRecord> records = new ArrayList<>();
    Row row;
    while ((row = scanner.next()) != null) {
      Schema schema = Schema.parseJson(row.getString(SCHEMA_COL));
      String recordStr = row.getString(RECORD_COL);
      records.add(StructuredRecordStringConverter.fromJsonString(recordStr, schema));
    }
    return records;
  }
}
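
A minimal usage sketch in a test context like the one in testAlertPublisher above; the dataset name "runtimeSink" and the expected count are illustrative, not taken from the original tests:

// Read back whatever the mock sink wrote during the run. The dataset name must
// match the one the sink was configured with; "runtimeSink" is a placeholder.
DataSetManager<Table> sinkManager = getDataset("runtimeSink");
List<StructuredRecord> written = MockRuntimeDatasetSink.readOutput(sinkManager);
Assert.assertEquals(2, written.size());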
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In class MockSQLEngine, the method writeInput:
/**
* Used to write the input records for the pipeline run. Should be called after the pipeline has been created.
*
* @param fileName file to write the records into
* @param records records that should be the input for the pipeline
*/
public static void writeInput(String fileName, Iterable<StructuredRecord> records) throws Exception {
  Function<StructuredRecord, String> mapper = input -> {
    try {
      return StructuredRecordStringConverter.toJsonString(input);
    } catch (IOException e) {
      throw new RuntimeException("Unable to set up file for test.", e);
    }
  };
  String output = Joiner.on("\n").join(Iterables.transform(records, mapper));
  Files.write(output, new File(fileName), Charsets.UTF_8);
}
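
A minimal usage sketch, assuming a StructuredRecord schema built the same way as in the test above; the file path and record values are illustrative:

// Serialize two illustrative records to newline-delimited JSON so the mock SQL
// engine can use them as pipeline input. The path is a placeholder.
Schema schema = Schema.recordOf("x", Schema.Field.of("id", Schema.of(Schema.Type.LONG)));
StructuredRecord record1 = StructuredRecord.builder(schema).set("id", 1L).build();
StructuredRecord record2 = StructuredRecord.builder(schema).set("id", 2L).build();
MockSQLEngine.writeInput("/tmp/mock-sql-input.json", ImmutableList.of(record1, record2));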
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In class MockSQLEngineWithStageSettings, the method writeInput:
/**
* Used to write the input records for the pipeline run. Should be called after the pipeline has been created.
*
* @param fileName file to write the records into
* @param records records that should be the input for the pipeline
*/
public static void writeInput(String fileName, Iterable<StructuredRecord> records) throws Exception {
  Function<StructuredRecord, String> mapper = input -> {
    try {
      return StructuredRecordStringConverter.toJsonString(input);
    } catch (IOException e) {
      throw new RuntimeException("Unable to set up file for test.", e);
    }
  };
  String output = Joiner.on("\n").join(Iterables.transform(records, mapper));
  Files.write(output, new File(fileName), Charsets.UTF_8);
}
Use of io.cdap.cdap.api.data.format.StructuredRecord in project cdap by caskdata.
In class MockPullProducer, the method produce:
@Override
public RecordCollection produce(SQLDataset dataset) {
  // Deserialize the expected records from JSON and adjust their data types
  TypeToken<HashSet<StructuredRecord>> typeToken = new TypeToken<HashSet<StructuredRecord>>() { };
  Type setOfStructuredRecordType = typeToken.getType();
  Set<StructuredRecord> jsonRecords = GSON.fromJson(expected, setOfStructuredRecordType);
  Set<StructuredRecord> records = new HashSet<>();
  for (StructuredRecord jsonRecord : jsonRecords) {
    records.add(transform(jsonRecord, jsonRecord.getSchema()));
  }

  // Build an RDD from the records and wrap the resulting DataFrame in a new RecordCollection
  SparkContext sc = SparkContext.getOrCreate();
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(sc);
  SQLContext sqlContext = new SQLContext(sc);
  StructType sparkSchema = DataFrames.toDataType(this.datasetDescription.getSchema());
  JavaRDD<Row> rdd = jsc.parallelize(new ArrayList<>(records)).map(sr -> DataFrames.toRow(sr, sparkSchema));
  Dataset<Row> ds = sqlContext.createDataFrame(rdd.rdd(), sparkSchema);
  return new SparkRecordCollectionImpl(ds);
}
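
For reference, a minimal sketch of the record-to-Row conversion used in produce() above, relying on the same DataFrames helper; the schema and value are illustrative:

// Convert a CDAP schema to a Spark StructType, then map a single record onto a
// Spark SQL Row using that schema. Values here are placeholders.
Schema schema = Schema.recordOf("x", Schema.Field.of("id", Schema.of(Schema.Type.LONG)));
StructuredRecord record = StructuredRecord.builder(schema).set("id", 7L).build();
StructType sparkSchema = DataFrames.toDataType(schema);
Row sparkRow = DataFrames.toRow(record, sparkSchema);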