Search in sources :

Example 96 with StructuredRecord

use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class ReflectionTableTest method testStructuredRecordRepresentation.

@Test
public void testStructuredRecordRepresentation() throws Exception {
    dsFrameworkUtil.createInstance("table", users, DatasetProperties.builder().build());
    try {
        final Table usersTable = dsFrameworkUtil.getInstance(users);
        final byte[] rowKey = Bytes.toBytes(123);
        final Schema schema = new ReflectionSchemaGenerator().generate(User.class);
        // TableDataset is not accessible here, but we know that's the underlying implementation...
        TransactionExecutor tx = dsFrameworkUtil.newTransactionExecutor((TransactionAware) usersTable);
        tx.execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                Put put = new Put(rowKey);
                ReflectionPutWriter<User> putWriter = new ReflectionPutWriter<>(schema);
                putWriter.write(SAMUEL, put);
                usersTable.put(put);
                Row row = usersTable.get(rowKey);
                ReflectionRowRecordReader rowReader = new ReflectionRowRecordReader(schema, null);
                StructuredRecord actual = rowReader.read(row, schema);
                assertRecordEqualsUser(SAMUEL, actual);
            }
        });
    } finally {
        dsFrameworkUtil.deleteInstance(users);
    }
}
Also used : Table(co.cask.cdap.api.dataset.table.Table) Schema(co.cask.cdap.api.data.schema.Schema) TransactionExecutor(org.apache.tephra.TransactionExecutor) ReflectionSchemaGenerator(co.cask.cdap.internal.io.ReflectionSchemaGenerator) Put(co.cask.cdap.api.dataset.table.Put) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) ReflectionPutWriter(co.cask.cdap.internal.io.ReflectionPutWriter) Row(co.cask.cdap.api.dataset.table.Row) ReflectionRowRecordReader(co.cask.cdap.internal.io.ReflectionRowRecordReader) Test(org.junit.Test)

Example 97 with StructuredRecord

use of co.cask.cdap.api.data.format.StructuredRecord in project cdap by caskdata.

the class StreamFormatSpecSpark method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    SQLContext sqlContext = new SQLContext(jsc);
    // Read from CSV stream and turn it into a DataFrame
    String streamName = sec.getRuntimeArguments().get("stream.name");
    Schema schema = Schema.recordOf("record", ImmutableList.of(Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("age", Schema.of(Schema.Type.INT))));
    FormatSpecification formatSpec = new FormatSpecification("csv", schema);
    JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> rdd = sec.fromStream(streamName, formatSpec, StructuredRecord.class);
    JavaRDD<Person> personRDD = rdd.values().map(new Function<GenericStreamEventData<StructuredRecord>, Person>() {

        @Override
        public Person call(GenericStreamEventData<StructuredRecord> data) throws Exception {
            StructuredRecord record = data.getBody();
            return new Person(record.<String>get("name"), record.<Integer>get("age"));
        }
    });
    sqlContext.createDataFrame(personRDD, Person.class).registerTempTable("people");
    // Execute a SQL on the table and save the result
    JavaPairRDD<String, Integer> resultRDD = sqlContext.sql(sec.getRuntimeArguments().get("sql.statement")).toJavaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

        @Override
        public Tuple2<String, Integer> call(Row row) throws Exception {
            return new Tuple2<>(row.getString(0), row.getInt(1));
        }
    });
    sec.saveAsDataset(resultRDD, sec.getRuntimeArguments().get("output.dataset"));
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) FormatSpecification(co.cask.cdap.api.data.format.FormatSpecification) GenericStreamEventData(co.cask.cdap.api.stream.GenericStreamEventData) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SQLContext(org.apache.spark.sql.SQLContext)

Aggregations

StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord)97 Schema (co.cask.cdap.api.data.schema.Schema)71 Test (org.junit.Test)51 Table (co.cask.cdap.api.dataset.table.Table)36 ETLStage (co.cask.cdap.etl.proto.v2.ETLStage)36 ApplicationId (co.cask.cdap.proto.id.ApplicationId)36 ApplicationManager (co.cask.cdap.test.ApplicationManager)33 AppRequest (co.cask.cdap.proto.artifact.AppRequest)31 KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable)25 ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig)25 WorkflowManager (co.cask.cdap.test.WorkflowManager)23 ArrayList (java.util.ArrayList)20 StreamEvent (co.cask.cdap.api.flow.flowlet.StreamEvent)19 FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification)18 HashSet (java.util.HashSet)10 DataStreamsConfig (co.cask.cdap.etl.proto.v2.DataStreamsConfig)8 File (java.io.File)8 TimeoutException (java.util.concurrent.TimeoutException)8 Put (co.cask.cdap.api.dataset.table.Put)7 ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin)7