Example 71 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class ObjectStores method objectStoreProperties.

/**
   * Creates properties for an {@link ObjectStore} dataset instance.
   *
   * @param type type of objects to be stored in the dataset
   * @param props existing properties to merge into the result
   * @return {@link DatasetProperties} for the dataset
   * @throws UnsupportedTypeException if a schema cannot be generated for the given type
   */
public static DatasetProperties objectStoreProperties(Type type, DatasetProperties props) throws UnsupportedTypeException {
    Schema schema = new ReflectionSchemaGenerator().generate(type);
    TypeRepresentation typeRep = new TypeRepresentation(type);
    return DatasetProperties.builder()
        .add("schema", schema.toString())
        .add("type", new Gson().toJson(typeRep))
        .addAll(props.getProperties())
        .build();
}
Also used : TypeRepresentation(co.cask.cdap.internal.io.TypeRepresentation) Schema(co.cask.cdap.api.data.schema.Schema) Gson(com.google.gson.Gson) ReflectionSchemaGenerator(co.cask.cdap.internal.io.ReflectionSchemaGenerator)
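
For orientation, here is a minimal sketch of how this helper might be wired into an application. The dataset name "purchases", the String payload type, and the createDataset call site are illustrative assumptions, not part of the snippet above.

// Hypothetical caller, e.g. inside an AbstractApplication's configure() method.
// The dataset name and payload type are illustrative assumptions.
DatasetProperties props = ObjectStores.objectStoreProperties(
    // any type the ReflectionSchemaGenerator can handle
    String.class,
    // extra properties to merge in; DatasetProperties.EMPTY adds none
    DatasetProperties.EMPTY);
createDataset("purchases", ObjectStore.class, props);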

Example 72 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class RecordFormat method initialize.

/**
   * Initialize the format with the given desired schema and properties.
   * Guaranteed to be called once before any other method is called.
   *
   * @param formatSpecification the specification for the format, containing the desired schema and settings
   * @throws UnsupportedTypeException if the desired schema and properties are not supported
   */
public void initialize(@Nullable FormatSpecification formatSpecification) throws UnsupportedTypeException {
    Schema desiredSchema = null;
    Map<String, String> settings = Collections.emptyMap();
    if (formatSpecification != null) {
        desiredSchema = formatSpecification.getSchema();
        settings = formatSpecification.getSettings();
    }
    desiredSchema = desiredSchema == null ? getDefaultSchema() : desiredSchema;
    if (desiredSchema == null) {
        String msg = "A schema must be provided to the format: ";
        if (formatSpecification != null) {
            msg += formatSpecification.getName();
        }
        throw new UnsupportedTypeException(msg);
    }
    validateIsRecord(desiredSchema);
    validateSchema(desiredSchema);
    this.schema = desiredSchema;
    configure(settings);
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) UnsupportedTypeException(co.cask.cdap.api.data.schema.UnsupportedTypeException)
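
A minimal sketch of driving this lifecycle, assuming format is an already-constructed RecordFormat subclass (e.g. a delimited-text format); the schema, the "csv" format name, and the delimiter setting are illustrative, not taken from the snippet above.

// Assumed: 'format' is an already-constructed RecordFormat implementation.
Schema bodySchema = Schema.recordOf(
    "event",
    Schema.Field.of("ts", Schema.of(Schema.Type.LONG)),
    Schema.Field.of("body", Schema.of(Schema.Type.STRING)));
FormatSpecification spec = new FormatSpecification(
    "csv", bodySchema, ImmutableMap.of("delimiter", ","));
// Throws UnsupportedTypeException if the schema or settings are rejected.
format.initialize(spec);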

Example 73 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class FlowVerification method verify.

/**
   * Verifies a single {@link FlowSpecification} for a {@link co.cask.cdap.api.flow.Flow}.
   *
   * @param input the specification to be verified
   * @return a {@link VerifyResult} reflecting the status of verification
   */
@Override
public VerifyResult verify(ApplicationId appId, final FlowSpecification input) {
    VerifyResult verifyResult = super.verify(appId, input);
    if (!verifyResult.isSuccess()) {
        return verifyResult;
    }
    String flowName = input.getName();
    // Check if there are no flowlets.
    if (input.getFlowlets().isEmpty()) {
        return VerifyResult.failure(Err.Flow.ATLEAST_ONE_FLOWLET, flowName);
    }
    // Check if there are no connections.
    if (input.getConnections().isEmpty()) {
        return VerifyResult.failure(Err.Flow.ATLEAST_ONE_CONNECTION, flowName);
    }
    // We go through each Flowlet and verify the flowlets.
    // First collect all source flowlet names
    Set<String> sourceFlowletNames = Sets.newHashSet();
    for (FlowletConnection connection : input.getConnections()) {
        if (connection.getSourceType() == FlowletConnection.Type.FLOWLET) {
            sourceFlowletNames.add(connection.getSourceName());
        }
    }
    for (Map.Entry<String, FlowletDefinition> entry : input.getFlowlets().entrySet()) {
        FlowletDefinition defn = entry.getValue();
        String flowletName = defn.getFlowletSpec().getName();
        // Check that the flowlet name is a valid ID.
        if (!EntityId.isValidId(defn.getFlowletSpec().getName())) {
            return VerifyResult.failure(Err.NOT_AN_ID, flowName + ":" + flowletName);
        }
        // Check that all dataset names used are valid IDs.
        for (String dataSet : defn.getDatasets()) {
            if (!EntityId.isValidDatasetId(dataSet)) {
                return VerifyResult.failure(Err.NOT_AN_ID, flowName + ":" + flowletName + ":" + dataSet);
            }
        }
        // If the flowlet has outputs, it must appear as a source flowlet in at least one connection
        if (!defn.getOutputs().isEmpty() && !sourceFlowletNames.contains(flowletName)) {
            return VerifyResult.failure(Err.Flow.OUTPUT_NOT_CONNECTED, flowName, flowletName);
        }
    }
    // NOTE: We should unify the logic here and the queue spec generation, as they are doing the same thing.
    Table<QueueSpecificationGenerator.Node, String, Set<QueueSpecification>> queueSpecTable = new SimpleQueueSpecificationGenerator(appId).create(input);
    // For all connections, there should be an entry in the table.
    for (FlowletConnection connection : input.getConnections()) {
        QueueSpecificationGenerator.Node node;
        if (connection.getSourceType() == FlowletConnection.Type.FLOWLET) {
            node = new QueueSpecificationGenerator.Node(connection.getSourceType(), connection.getSourceName());
        } else {
            String sourceNamespace = connection.getSourceNamespace() == null ? appId.getNamespace() : connection.getSourceNamespace();
            node = new QueueSpecificationGenerator.Node(connection.getSourceType(), sourceNamespace, connection.getSourceName());
        }
        if (!queueSpecTable.contains(node, connection.getTargetName())) {
            return VerifyResult.failure(Err.Flow.NO_INPUT_FOR_OUTPUT, flowName, connection.getTargetName(), connection.getSourceType(), connection.getSourceName());
        }
    }
    // For each output entity, check for any unconnected output
    for (QueueSpecificationGenerator.Node node : queueSpecTable.rowKeySet()) {
        // For stream output, no need to check
        if (node.getType() == FlowletConnection.Type.STREAM) {
            continue;
        }
        // For all outputs of a flowlet, remove every matched connected schema; if anything is left,
        // the flow connection is incomplete (it has outputs not connected to any input).
        Multimap<String, Schema> outputs = toMultimap(input.getFlowlets().get(node.getName()).getOutputs());
        for (Map.Entry<String, Set<QueueSpecification>> entry : queueSpecTable.row(node).entrySet()) {
            for (QueueSpecification queueSpec : entry.getValue()) {
                outputs.remove(queueSpec.getQueueName().getSimpleName(), queueSpec.getOutputSchema());
            }
        }
        if (!outputs.isEmpty()) {
            return VerifyResult.failure(Err.Flow.MORE_OUTPUT_NOT_ALLOWED, flowName, node.getType().toString().toLowerCase(), node.getName(), outputs);
        }
    }
    return VerifyResult.success();
}
Also used : Set(java.util.Set) FlowletConnection(co.cask.cdap.api.flow.FlowletConnection) Schema(co.cask.cdap.api.data.schema.Schema) SimpleQueueSpecificationGenerator(co.cask.cdap.internal.app.queue.SimpleQueueSpecificationGenerator) FlowletDefinition(co.cask.cdap.api.flow.FlowletDefinition) QueueSpecificationGenerator(co.cask.cdap.app.queue.QueueSpecificationGenerator) SimpleQueueSpecificationGenerator(co.cask.cdap.internal.app.queue.SimpleQueueSpecificationGenerator) QueueSpecification(co.cask.cdap.app.queue.QueueSpecification) VerifyResult(co.cask.cdap.app.verification.VerifyResult) Map(java.util.Map)
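
To make the output-matching step above concrete, here is a small self-contained sketch of the same Multimap bookkeeping: each (queue name, schema) pair a flowlet declares as output is removed once a matching queue spec is found, and any leftover entry marks a dangling output. The literal queue name "out" is illustrative.

// Each flowlet output is keyed by queue name and mapped to its schema.
Multimap<String, Schema> outputs = HashMultimap.create();
outputs.put("out", Schema.of(Schema.Type.STRING));
// A matching queue spec removes the exact (queue name, schema) pair...
outputs.remove("out", Schema.of(Schema.Type.STRING));
// ...so an empty multimap means every declared output is connected.
boolean fullyConnected = outputs.isEmpty();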

Example 74 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class ETLWorkerTest method testOneSourceOneSink.

@Test
@Category(SlowTests.class)
public void testOneSourceOneSink() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("id", Schema.of(Schema.Type.STRING)), Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = new ArrayList<>();
    input.add(StructuredRecord.builder(schema).set("id", "123").set("name", "samuel").build());
    input.add(StructuredRecord.builder(schema).set("id", "456").set("name", "jackson").build());
    File tmpDir = TMP_FOLDER.newFolder();
    ETLRealtimeConfig etlConfig = ETLRealtimeConfig.builder()
        .addStage(new ETLStage("source", MockSource.getPlugin(input)))
        .addStage(new ETLStage("sink", MockSink.getPlugin(tmpDir)))
        .addConnection("source", "sink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("simpleApp");
    AppRequest<ETLRealtimeConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkerManager workerManager = appManager.getWorkerManager(ETLWorker.NAME);
    workerManager.start();
    workerManager.waitForStatus(true, 10, 1);
    try {
        List<StructuredRecord> written = MockSink.getRecords(tmpDir, 0, 10, TimeUnit.SECONDS);
        Assert.assertEquals(input, written);
    } finally {
        stopWorker(workerManager);
    }
    validateMetric(2, appId, "source.records.out");
    validateMetric(2, appId, "sink.records.in");
}
Also used : WorkerManager(co.cask.cdap.test.WorkerManager) ApplicationManager(co.cask.cdap.test.ApplicationManager) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) ETLRealtimeConfig(co.cask.cdap.etl.proto.v2.ETLRealtimeConfig) ApplicationId(co.cask.cdap.proto.id.ApplicationId) File(java.io.File) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
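
As a small companion sketch, individual fields can be read back off a returned StructuredRecord when debugging the list assertion above; the field names come from the schema in the test, while the local variables are illustrative.

// Assumed: 'written' is the list returned by MockSink.getRecords above.
StructuredRecord first = written.get(0);
// StructuredRecord.get infers the target type from the assignment.
String id = first.get("id");      // "123"
String name = first.get("name");  // "samuel"
Assert.assertEquals("samuel", name);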

Example 75 with Schema

use of co.cask.cdap.api.data.schema.Schema in project cdap by caskdata.

the class ETLWorkerTest method testLookup.

@Test
public void testLookup() throws Exception {
    addDatasetInstance(KeyValueTable.class.getName(), "lookupTable");
    DataSetManager<KeyValueTable> lookupTable = getDataset("lookupTable");
    lookupTable.get().write("Bob".getBytes(Charsets.UTF_8), "123".getBytes(Charsets.UTF_8));
    lookupTable.flush();
    File outDir = TMP_FOLDER.newFolder();
    ETLRealtimeConfig etlConfig = ETLRealtimeConfig.builder()
        .addStage(new ETLStage("source", LookupSource.getPlugin(ImmutableSet.of("Bob", "Bill"), "lookupTable")))
        .addStage(new ETLStage("sink", MockSink.getPlugin(outDir)))
        .addConnection("source", "sink")
        .build();
    ApplicationId appId = NamespaceId.DEFAULT.app("lookupTestApp");
    AppRequest<ETLRealtimeConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkerManager workerManager = appManager.getWorkerManager(ETLWorker.NAME);
    workerManager.start();
    workerManager.waitForStatus(true, 10, 1);
    Schema schema = Schema.recordOf("bobbill", Schema.Field.of("Bob", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("Bill", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
    List<StructuredRecord> expected = new ArrayList<>();
    expected.add(StructuredRecord.builder(schema).set("Bob", "123").build());
    try {
        List<StructuredRecord> actual = MockSink.getRecords(outDir, 0, 10, TimeUnit.SECONDS);
        Assert.assertEquals(expected, actual);
    } finally {
        stopWorker(workerManager);
    }
    validateMetric(1, appId, "source.records.out");
    validateMetric(1, appId, "sink.records.in");
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) ETLRealtimeConfig(co.cask.cdap.etl.proto.v2.ETLRealtimeConfig) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) AppRequest(co.cask.cdap.proto.artifact.AppRequest) WorkerManager(co.cask.cdap.test.WorkerManager) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) ApplicationId(co.cask.cdap.proto.id.ApplicationId) File(java.io.File) Test(org.junit.Test)
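
A note on why the expected record only sets "Bob": both fields in the schema are declared nullable, so a record that never sets "Bill" is still valid and reads back as null. A minimal self-contained sketch:

Schema s = Schema.recordOf(
    "bobbill",
    Schema.Field.of("Bob", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
    Schema.Field.of("Bill", Schema.nullableOf(Schema.of(Schema.Type.STRING))));
StructuredRecord r = StructuredRecord.builder(s).set("Bob", "123").build();
Assert.assertNull(r.get("Bill"));  // unset nullable field reads back as null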

Aggregations

Schema (co.cask.cdap.api.data.schema.Schema): 210
Test (org.junit.Test): 92
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 69
Table (co.cask.cdap.api.dataset.table.Table): 38
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 35
ApplicationId (co.cask.cdap.proto.id.ApplicationId): 34
FormatSpecification (co.cask.cdap.api.data.format.FormatSpecification): 32
ApplicationManager (co.cask.cdap.test.ApplicationManager): 30
AppRequest (co.cask.cdap.proto.artifact.AppRequest): 29
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 24
IOException (java.io.IOException): 23
ETLBatchConfig (co.cask.cdap.etl.proto.v2.ETLBatchConfig): 22
ReflectionSchemaGenerator (co.cask.cdap.internal.io.ReflectionSchemaGenerator): 22
ArrayList (java.util.ArrayList): 22
WorkflowManager (co.cask.cdap.test.WorkflowManager): 20
Map (java.util.Map): 18
Set (java.util.Set): 14
UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException): 12
HashMap (java.util.HashMap): 12
HashSet (java.util.HashSet): 11