
Example 1 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

The class ConnectorSink, method modifyRecord.

private StructuredRecord modifyRecord(KeyValue<String, StructuredRecord> input) throws IOException {
    String stageName = input.getKey();
    Schema inputSchema = input.getValue().getSchema();
    return StructuredRecord.builder(ConnectorSource.RECORD_WITH_SCHEMA)
        .set("stageName", stageName)
        .set("schema", inputSchema.toString())
        .set("record", StructuredRecordStringConverter.toJsonString(input.getValue()))
        .build();
}
Also used: Schema (co.cask.cdap.api.data.schema.Schema)
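
The wrapped record can be unpacked again on the receiving side. A minimal sketch of the reverse operation, assuming the Schema.parseJson and StructuredRecordStringConverter.fromJsonString methods of the same CDAP API; the method name restoreRecord is hypothetical:

private StructuredRecord restoreRecord(StructuredRecord wrapper) throws IOException {
    // Recover the schema from its JSON string representation.
    Schema schema = Schema.parseJson((String) wrapper.get("schema"));
    // Rebuild the original record from its JSON payload.
    return StructuredRecordStringConverter.fromJsonString((String) wrapper.get("record"), schema);
}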

Example 2 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

The class ETLWorkerTest, method testOneSourceOneSink.

@Test
@Category(SlowTests.class)
public void testOneSourceOneSink() throws Exception {
    Schema schema = Schema.recordOf("test",
        Schema.Field.of("id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = new ArrayList<>();
    input.add(StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build());
    input.add(StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build());
    File tmpDir = TMP_FOLDER.newFolder();
    ETLRealtimeConfig etlConfig = ETLRealtimeConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(input)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(tmpDir)))
        .addConnection("source", "sink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
    AppRequest<ETLRealtimeConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkerManager workerManager = appManager.getWorkerManager(ETLWorker.NAME);
    workerManager.start();
    workerManager.waitForStatus(true, 10, 1);
    try {
        List<StructuredRecord> written = MockSink.getRecords(tmpDir, 0, 10, TimeUnit.SECONDS);
        Assert.assertEquals(input, written);
    } finally {
        stopWorker(workerManager);
    }
    validateMetric(2, appId, "source.records.out");
    validateMetric(2, appId, "sink.records.in");
}
Also used: WorkerManager (co.cask.cdap.test.WorkerManager), ApplicationManager (co.cask.cdap.test.ApplicationManager), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), Schema (co.cask.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), ETLRealtimeConfig (co.cask.cdap.etl.proto.v2.ETLRealtimeConfig), ApplicationId (co.cask.cdap.proto.id.ApplicationId), File (java.io.File), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), AppRequest (co.cask.cdap.proto.artifact.AppRequest), Category (org.junit.experimental.categories.Category), Test (org.junit.Test)
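
For reference, a field can be made optional by wrapping its type with Schema.nullableOf, as Example 3 below does. A minimal sketch, not taken from the test above; the "email" field and the values set here are hypothetical:

Schema schema = Schema.recordOf("test",
    Schema.Field.of("id", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("email", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
// The nullable field may simply be left unset on the builder.
StructuredRecord record = StructuredRecord.builder(schema)
    .set("id", "789")
    .set("name", "lee")
    .build();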

Example 3 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

The class ETLWorkerTest, method testLookup.

@Test
public void testLookup() throws Exception {
    addDatasetInstance(KeyValueTable.class.getName(), "lookupTable");
    DataSetManager<KeyValueTable> lookupTable = getDataset("lookupTable");
    lookupTable.get().write("Bob".getBytes(Charsets.UTF_8), "123".getBytes(Charsets.UTF_8));
    lookupTable.flush();
    File outDir = TMP_FOLDER.newFolder();
    ETLRealtimeConfig etlConfig = ETLRealtimeConfig.builder()
        .addStage(new ETLStage("source", LookupSource.getPlugin(ImmutableSet.of("Bob", "Bill"), "lookupTable")))
        .addStage(new ETLStage("sink", MockSink.getPlugin(outDir)))
        .addConnection("source", "sink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("lookupTestApp");
    AppRequest<ETLRealtimeConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkerManager workerManager = appManager.getWorkerManager(ETLWorker.NAME);
    workerManager.start();
    workerManager.waitForStatus(true, 10, 1);
    Schema schema = Schema.recordOf("bobbill",
        Schema.Field.of("Bob", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
        Schema.Field.of("Bill", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    List<StructuredRecord> expected = new ArrayList<>();
    expected.add(StructuredRecord.builder(schema).set("Bob", "123").build());
    try {
        List<StructuredRecord> actual = MockSink.getRecords(outDir, 0, 10, TimeUnit.SECONDS);
        Assert.assertEquals(expected, actual);
    } finally {
        stopWorker(workerManager);
    }
    validateMetric(1, appId, "source.records.out");
    validateMetric(1, appId, "sink.records.in");
}
Also used: ApplicationManager (co.cask.cdap.test.ApplicationManager), Schema (co.cask.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), ETLRealtimeConfig (co.cask.cdap.etl.proto.v2.ETLRealtimeConfig), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), AppRequest (co.cask.cdap.proto.artifact.AppRequest), WorkerManager (co.cask.cdap.test.WorkerManager), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable), ApplicationId (co.cask.cdap.proto.id.ApplicationId), File (java.io.File), Test (org.junit.Test)
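
The lookup table written at the start of the test can be read back through the same dataset handle. A minimal sketch, assuming the KeyValueTable.read method of the CDAP dataset API:

byte[] value = lookupTable.get().read("Bob".getBytes(Charsets.UTF_8));
// read returns null for an absent key, e.g. "Bill", which was never written.
String id = value == null ? null : new String(value, Charsets.UTF_8);  // "123"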

Example 4 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

The class FileStreamAdmin, method updateConfig.

@Override
public void updateConfig(final StreamId streamId, final StreamProperties properties) throws Exception {
    Location streamLocation = impersonator.doAs(streamId.getParent(), new Callable<Location>() {

        @Override
        public Location call() throws Exception {
            return getStreamLocation(streamId);
        }
    });
    Preconditions.checkArgument(streamLocation.isDirectory(), "Stream '%s' does not exist.", streamId);
    SecurityUtil.verifyOwnerPrincipal(streamId, properties.getOwnerPrincipal(), ownerAdmin);
    streamCoordinatorClient.updateProperties(streamId, new Callable<CoordinatorStreamProperties>() {

        @Override
        public CoordinatorStreamProperties call() throws Exception {
            StreamProperties oldProperties = updateProperties(streamId, properties);
            FormatSpecification format = properties.getFormat();
            if (format != null) {
                // If the schema has changed, we need to recreate the Hive table.
                // Changes in format and settings don't require a Hive change,
                // as they are just properties used by the stream storage handler.
                Schema currSchema = oldProperties.getFormat().getSchema();
                Schema newSchema = format.getSchema();
                if (!Objects.equals(currSchema, newSchema)) {
                    alterExploreStream(streamId, false, null);
                    alterExploreStream(streamId, true, format);
                }
            }
            publishAudit(streamId, AuditType.UPDATE);
            return new CoordinatorStreamProperties(properties.getTTL(), properties.getFormat(),
                properties.getNotificationThresholdMB(), null, properties.getDescription(),
                properties.getOwnerPrincipal());
        }
    });
}
Also used: CoordinatorStreamProperties (co.cask.cdap.data.stream.CoordinatorStreamProperties), Schema (co.cask.cdap.api.data.schema.Schema), StreamProperties (co.cask.cdap.proto.StreamProperties), FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification), Callable (java.util.concurrent.Callable), NotificationFeedException (co.cask.cdap.notifications.feeds.NotificationFeedException), FileNotFoundException (java.io.FileNotFoundException), IOException (java.io.IOException), NotFoundException (co.cask.cdap.common.NotFoundException), StreamNotFoundException (co.cask.cdap.common.StreamNotFoundException), Location (org.apache.twill.filesystem.Location)
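
The schema comparison above uses Objects.equals rather than calling equals directly because either schema may be null, for example when a format carries no schema at all. A minimal illustrative sketch; the values are hypothetical:

Schema currSchema = null;                          // e.g. the old format had no schema
Schema newSchema = Schema.of(Schema.Type.STRING);
// Objects.equals is null-safe; currSchema.equals(newSchema) would throw a NullPointerException here.
boolean schemaChanged = !java.util.Objects.equals(currSchema, newSchema);  // true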

Example 5 with Schema

Use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

The class ClassicSparkProgram, method main.

public static void main(String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.kryo.registrator", MyKryoRegistrator.class.getName());
    Schema schema = Schema.recordOf("record",
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("id", Schema.of(Schema.Type.INT)));
    List<StructuredRecord> records = new ArrayList<>();
    for (int i = 1; i <= 10; i++) {
        records.add(StructuredRecord.builder(schema).set("name", "Name" + i).set("id", i).build());
    }
    // This tests serialization of StructuredRecord as well as the use of a custom Kryo serializer.
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    int result = jsc.parallelize(records).mapToPair(new PairFunction<StructuredRecord, MyInt, StructuredRecord>() {

        @Override
        public Tuple2<MyInt, StructuredRecord> call(StructuredRecord record) throws Exception {
            return new Tuple2<>(new MyInt((Integer) record.get("id")), record);
        }
    }).map(new Function<Tuple2<MyInt, StructuredRecord>, MyInt>() {

        @Override
        public MyInt call(Tuple2<MyInt, StructuredRecord> tuple) throws Exception {
            return tuple._1;
        }
    }).reduce(new Function2<MyInt, MyInt, MyInt>() {

        @Override
        public MyInt call(MyInt v1, MyInt v2) throws Exception {
            return new MyInt(v1.toInt() + v2.toInt());
        }
    }).toInt();
    // The ids 1..10 sum to 1 + 2 + ... + 10 = 55.
    if (result != 55) {
        throw new Exception("Expected result to be 55");
    }
}
Also used: Schema (co.cask.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), Function2 (org.apache.spark.api.java.function.Function2), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), Tuple2 (scala.Tuple2), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), PairFunction (org.apache.spark.api.java.function.PairFunction), SparkConf (org.apache.spark.SparkConf)
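
MyKryoRegistrator itself is not shown above. A hedged sketch of what a Spark KryoRegistrator for the custom MyInt type could look like; the registration details are an assumption, not the project's actual implementation:

import com.esotericsoftware.kryo.Kryo;
import org.apache.spark.serializer.KryoRegistrator;

public class MyKryoRegistrator implements KryoRegistrator {
    @Override
    public void registerClasses(Kryo kryo) {
        // Register the custom type so Kryo serializes it by registered id
        // instead of falling back to writing the full class name.
        kryo.register(MyInt.class);
    }
}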

Aggregations

Schema (co.cask.cdap.api.data.schema.Schema): 245
Test (org.junit.Test): 112
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 88
Table (co.cask.cdap.api.dataset.table.Table): 54
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 50
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 49
ApplicationManager (co.cask.cdap.test.ApplicationManager): 45
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 44
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 37
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 35
FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 33
WorkflowManager (co.cask.cdap.test.WorkflowManager): 31
ArrayList (java.util.ArrayList): 26
ReflectionSchemaGenerator (co.cask.cdap.internal.io.ReflectionSchemaGenerator): 23
IOException (java.io.IOException): 23
Map (java.util.Map): 22
HashMap (java.util.HashMap): 18
Set (java.util.Set): 16
HashSet (java.util.HashSet): 14
UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException): 12