Example 6 with Stream

Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.

The configure method of the class WikipediaPipelineApp:

@Override
public void configure() {
    // Streams for ingesting page titles and raw Wikipedia data
    addStream(new Stream(PAGE_TITLES_STREAM));
    addStream(new Stream(RAW_WIKIPEDIA_STREAM));
    // MapReduce programs that load stream data into datasets, download and
    // normalize page content, and compute the top-N words
    addMapReduce(new StreamToDataset(LIKES_TO_DATASET_MR_NAME));
    addMapReduce(new StreamToDataset(WIKIPEDIA_TO_DATASET_MR_NAME));
    addMapReduce(new WikipediaDataDownloader());
    addMapReduce(new WikiContentValidatorAndNormalizer());
    addMapReduce(new TopNMapReduce());
    // Spark program for clustering, parameterized by the application config
    addSpark(new SparkWikipediaClustering(getConfig()));
    createDataset(PAGE_TITLES_DATASET, KeyValueTable.class, DatasetProperties.builder().setDescription("Page titles dataset").build());
    createDataset(RAW_WIKIPEDIA_DATASET, KeyValueTable.class, DatasetProperties.builder().setDescription("Raw Wikipedia dataset").build());
    createDataset(NORMALIZED_WIKIPEDIA_DATASET, KeyValueTable.class, DatasetProperties.builder().setDescription("Normalized Wikipedia dataset").build());
    createDataset(SPARK_CLUSTERING_OUTPUT_DATASET, Table.class, DatasetProperties.builder().setDescription("Spark clustering output dataset").build());
    createDataset(MAPREDUCE_TOPN_OUTPUT, KeyValueTable.class, DatasetProperties.builder().setDescription("MapReduce top-'N'-words output dataset").build());
    addWorkflow(new WikipediaPipelineWorkflow(getConfig()));
    addService(new WikipediaService());
}
Also used: Stream (co.cask.cdap.api.data.stream.Stream)
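
The getConfig() call above implies that WikipediaPipelineApp extends AbstractApplication with a typed config parameter. A minimal sketch of that surrounding class shape, with a hypothetical config field (the real field names live in the cdap sources, not on this page):

import co.cask.cdap.api.Config;
import co.cask.cdap.api.app.AbstractApplication;

public class WikipediaPipelineApp extends AbstractApplication<WikipediaPipelineApp.WikipediaAppConfig> {

    public static class WikipediaAppConfig extends Config {
        // Hypothetical field: the real config parameterizes SparkWikipediaClustering
        public String clusteringAlgorithm = "lda";
    }

    @Override
    public void configure() {
        // ... as shown above ...
    }
}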

Example 7 with Stream

Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.

The configure method of the class StreamConversionApp:

@Override
public void configure() {
    addStream(new Stream("events"));
    addMapReduce(new StreamConversionMapReduce());
    addWorkflow(new StreamConversionWorkflow());
    schedule(buildSchedule("every5min", ProgramType.WORKFLOW, "StreamConversionWorkflow")
        .setDescription("runs every 5 minutes")
        .triggerByTime("*/5 * * * *"));
    // create the time-partitioned file set and configure it to work with MapReduce and with Explore
    createDataset("converted", TimePartitionedFileSet.class, FileSetProperties.builder()
        .setBasePath("converted")
        .setInputFormat(AvroKeyInputFormat.class)
        .setOutputFormat(AvroKeyOutputFormat.class)
        .setOutputProperty("schema", SCHEMA_STRING)
        .setEnableExploreOnCreate(true)
        .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
        .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
        .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
        .setTableProperty("avro.schema.literal", SCHEMA_STRING)
        .setDescription("Converted stream events dataset")
        .build());
}
Also used: AvroKeyOutputFormat (org.apache.avro.mapreduce.AvroKeyOutputFormat), Stream (co.cask.cdap.api.data.stream.Stream)
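
SCHEMA_STRING is defined elsewhere in StreamConversionApp and must be a valid Avro schema literal, since the same string is handed both to the Avro output format and to Hive via avro.schema.literal. A purely illustrative reconstruction (the field names here are assumptions, not taken from the real app):

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

// Hypothetical schema; the real SCHEMA_STRING lives in StreamConversionApp
static final Schema SCHEMA = SchemaBuilder.record("streamEvent")
    .fields()
    .requiredLong("ts")        // event timestamp
    .requiredString("body")    // event body
    .endRecord();
static final String SCHEMA_STRING = SCHEMA.toString();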

Example 8 with Stream

Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.

The configure method of the class UserProfiles:

@Override
public void configure() {
    setName("UserProfiles");
    setDescription("Demonstrates the use of column-level conflict detection");
    addStream(new Stream("events"));
    addFlow(new ActivityFlow());
    addService(new UserProfileService());
    createDataset("counters", KeyValueTable.class, DatasetProperties.builder().setDescription("Counters key-value table").build());
    // create the profiles table with a schema so that it can be explored via Hive
    Schema profileSchema = Schema.recordOf(
        "profile",
        // id, name, and email are never null and are set when a user profile is created
        Schema.Field.of("id", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
        Schema.Field.of("email", Schema.of(Schema.Type.STRING)),
        // login and active are not set at profile creation but later on, so they are nullable
        Schema.Field.of("login", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
        Schema.Field.of("active", Schema.nullableOf(Schema.of(Schema.Type.LONG))));
    createDataset("profiles", Table.class.getName(), TableProperties.builder()
        .setConflictDetection(ConflictDetection.COLUMN)
        .setSchema(profileSchema)
        .setRowFieldName("id")
        .setDescription("Profiles table with column-level conflict detection")
        .build());
}
Also used: Table (co.cask.cdap.api.dataset.table.Table), KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable), Schema (co.cask.cdap.api.data.schema.Schema), Stream (co.cask.cdap.api.data.stream.Stream)
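
Column-level conflict detection matters here because different programs update different columns of the same profile row concurrently. A minimal, hypothetical sketch of such an update (the method and names are illustrative, not taken from UserProfileService):

import co.cask.cdap.api.dataset.table.Put;
import co.cask.cdap.api.dataset.table.Table;

// Hypothetical helper: writes only the "login" column of one profile row.
// With ConflictDetection.COLUMN, a concurrent transaction writing the
// "active" column of the same row would not conflict with this write.
void recordLogin(Table profiles, String userId, long loginTime) {
    profiles.put(new Put(userId).add("login", loginTime));
}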

Example 9 with Stream

Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.

The configure method of the class SparkPageRankApp:

@Override
public void configure() {
    setName("SparkPageRank");
    setDescription("Spark page rank application.");
    // Ingest data into the Application via a Stream
    addStream(new Stream(BACKLINK_URL_STREAM));
    // Run a Spark program on the acquired data
    addSpark(new PageRankSpark());
    // Runs MapReduce program on data emitted by Spark program
    addMapReduce(new RanksCounter());
    // Runs Spark followed by a MapReduce in a Workflow
    addWorkflow(new PageRankWorkflow());
    // Service to retrieve process data
    addService(SERVICE_HANDLERS, new SparkPageRankServiceHandler());
    // Store input and processed data in ObjectStore Datasets
    try {
        ObjectStores.createObjectStore(getConfigurer(), "ranks", Integer.class,
            DatasetProperties.builder().setDescription("Ranks Dataset").build());
        ObjectStores.createObjectStore(getConfigurer(), "rankscount", Integer.class,
            DatasetProperties.builder().setDescription("Ranks Count Dataset").build());
    } catch (UnsupportedTypeException e) {
        // cannot happen: Integer is a concrete class, so its type is supported by ObjectStore
        throw new RuntimeException(e);
    }
}
Also used: UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException), Stream (co.cask.cdap.api.data.stream.Stream)
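
For context, UnsupportedTypeException is thrown when ObjectStore cannot derive a schema for the requested type. A hypothetical illustration (dataset name and type chosen only for this example; an interface type has no concrete fields to reflect a schema from):

// Would be expected to fail with UnsupportedTypeException: List is an interface
ObjectStores.createObjectStore(getConfigurer(), "badStore", java.util.List.class,
    DatasetProperties.EMPTY);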

Example 10 with Stream

Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.

The configure method of the class NoMapperApp:

@Override
public void configure() {
    addStream(new Stream("nomapper"));
    createDataset("results", KeyValueTable.class);
    addMapReduce(new NoMapperMapReduce());
}
Also used: Stream (co.cask.cdap.api.data.stream.Stream)
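
Each configure() fragment on this page belongs inside an application class. A minimal sketch of the surrounding class for this example, assuming the standard CDAP application API (NoMapperMapReduce is defined elsewhere in the cdap sources):

import co.cask.cdap.api.app.AbstractApplication;
import co.cask.cdap.api.data.stream.Stream;
import co.cask.cdap.api.dataset.lib.KeyValueTable;

public class NoMapperApp extends AbstractApplication {
    @Override
    public void configure() {
        addStream(new Stream("nomapper"));
        createDataset("results", KeyValueTable.class);
        addMapReduce(new NoMapperMapReduce());
    }
}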

Aggregations

Stream (co.cask.cdap.api.data.stream.Stream): 48 usages
UnsupportedTypeException (co.cask.cdap.api.data.schema.UnsupportedTypeException): 10 usages
BasicService (co.cask.cdap.api.service.BasicService): 6 usages
InputStream (java.io.InputStream): 2 usages
Schema (co.cask.cdap.api.data.schema.Schema): 1 usage
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 1 usage
Table (co.cask.cdap.api.dataset.table.Table): 1 usage
AvroKeyOutputFormat (org.apache.avro.mapreduce.AvroKeyOutputFormat): 1 usage
TextOutputFormat (org.apache.hadoop.mapreduce.lib.output.TextOutputFormat): 1 usage