Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.
Class WikipediaPipelineApp, method configure().
@Override
public void configure() {
  // Streams through which raw page titles and raw Wikipedia data enter the pipeline.
  addStream(new Stream(PAGE_TITLES_STREAM));
  addStream(new Stream(RAW_WIKIPEDIA_STREAM));

  // MapReduce programs: stream-to-dataset loaders, the Wikipedia downloader,
  // the content validator/normalizer, and the top-N word counter.
  addMapReduce(new StreamToDataset(LIKES_TO_DATASET_MR_NAME));
  addMapReduce(new StreamToDataset(WIKIPEDIA_TO_DATASET_MR_NAME));
  addMapReduce(new WikipediaDataDownloader());
  addMapReduce(new WikiContentValidatorAndNormalizer());
  addMapReduce(new TopNMapReduce());

  // Spark clustering program, parameterized by the application configuration.
  addSpark(new SparkWikipediaClustering(getConfig()));

  // Datasets used by the programs above.
  createDataset(PAGE_TITLES_DATASET, KeyValueTable.class,
                DatasetProperties.builder()
                  .setDescription("Page titles dataset")
                  .build());
  createDataset(RAW_WIKIPEDIA_DATASET, KeyValueTable.class,
                DatasetProperties.builder()
                  .setDescription("Raw Wikipedia dataset")
                  .build());
  createDataset(NORMALIZED_WIKIPEDIA_DATASET, KeyValueTable.class,
                DatasetProperties.builder()
                  .setDescription("Normalized Wikipedia dataset")
                  .build());
  createDataset(SPARK_CLUSTERING_OUTPUT_DATASET, Table.class,
                DatasetProperties.builder()
                  .setDescription("Spark clustering output dataset")
                  .build());
  createDataset(MAPREDUCE_TOPN_OUTPUT, KeyValueTable.class,
                DatasetProperties.builder()
                  .setDescription("MapReduce top-'N'-words output dataset")
                  .build());

  // Workflow tying the programs together, plus the query service.
  addWorkflow(new WikipediaPipelineWorkflow(getConfig()));
  addService(new WikipediaService());
}
Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.
Class StreamConversionApp, method configure().
@Override
public void configure() {
  // Events arrive via a stream and are converted by a MapReduce driven by a workflow.
  addStream(new Stream("events"));
  addMapReduce(new StreamConversionMapReduce());
  addWorkflow(new StreamConversionWorkflow());

  // Trigger the workflow on a five-minute cron schedule.
  schedule(buildSchedule("every5min", ProgramType.WORKFLOW, "StreamConversionWorkflow")
             .setDescription("runs every 5 minutes")
             .triggerByTime("*/5 * * * *"));

  // Time-partitioned file set for the converted events, wired up for both
  // MapReduce (Avro key input/output formats) and Explore (Hive Avro SerDe).
  createDataset("converted", TimePartitionedFileSet.class,
                FileSetProperties.builder()
                  .setBasePath("converted")
                  .setInputFormat(AvroKeyInputFormat.class)
                  .setOutputFormat(AvroKeyOutputFormat.class)
                  .setOutputProperty("schema", SCHEMA_STRING)
                  .setEnableExploreOnCreate(true)
                  .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
                  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
                  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
                  .setTableProperty("avro.schema.literal", SCHEMA_STRING)
                  .setDescription("Converted stream events dataset")
                  .build());
}
Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.
Class UserProfiles, method configure().
@Override
public void configure() {
  setName("UserProfiles");
  setDescription("Demonstrates the use of column-level conflict detection");

  // Ingest, processing, and query components.
  addStream(new Stream("events"));
  addFlow(new ActivityFlow());
  addService(new UserProfileService());

  createDataset("counters", KeyValueTable.class,
                DatasetProperties.builder()
                  .setDescription("Counters key-value table")
                  .build());

  // Schema for the profiles table so it can be explored via Hive.
  // id, name, and email are set when a profile is created and are never null;
  // login and active are only set later, so they are nullable.
  Schema profileSchema = Schema.recordOf(
    "profile",
    Schema.Field.of("id", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("email", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("login", Schema.nullableOf(Schema.of(Schema.Type.LONG))),
    Schema.Field.of("active", Schema.nullableOf(Schema.of(Schema.Type.LONG))));

  // Profiles table with column-level conflict detection, keyed on "id".
  createDataset("profiles", Table.class.getName(),
                TableProperties.builder()
                  .setConflictDetection(ConflictDetection.COLUMN)
                  .setSchema(profileSchema)
                  .setRowFieldName("id")
                  .setDescription("Profiles table with column-level conflict detection")
                  .build());
}
Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.
Class SparkPageRankApp, method configure().
@Override
public void configure() {
  setName("SparkPageRank");
  setDescription("Spark page rank application.");

  // Data is ingested into the application via a stream.
  addStream(new Stream(BACKLINK_URL_STREAM));

  // Spark computes page ranks; a MapReduce then counts ranks; a workflow
  // runs the Spark program followed by the MapReduce.
  addSpark(new PageRankSpark());
  addMapReduce(new RanksCounter());
  addWorkflow(new PageRankWorkflow());

  // Service for retrieving the processed data.
  addService(SERVICE_HANDLERS, new SparkPageRankServiceHandler());

  // ObjectStore datasets holding the computed ranks and the rank counts.
  try {
    ObjectStores.createObjectStore(getConfigurer(), "ranks", Integer.class,
                                   DatasetProperties.builder()
                                     .setDescription("Ranks Dataset")
                                     .build());
    ObjectStores.createObjectStore(getConfigurer(), "rankscount", Integer.class,
                                   DatasetProperties.builder()
                                     .setDescription("Ranks Count Dataset")
                                     .build());
  } catch (UnsupportedTypeException e) {
    // Cannot happen in practice: Integer is a standard, fully supported class,
    // so its type is always representable by the ObjectStore.
    throw new RuntimeException(e);
  }
}
Use of co.cask.cdap.api.data.stream.Stream in project cdap by caskdata.
Class NoMapperApp, method configure().
@Override
public void configure() {
  // Stream providing input to the reducer-only MapReduce below.
  addStream(new Stream("nomapper"));
  // Output key-value table. A description is supplied for consistency with the
  // other applications in this project, which all describe their datasets.
  createDataset("results", KeyValueTable.class,
                DatasetProperties.builder()
                  .setDescription("Results key-value table")
                  .build());
  // MapReduce program that runs with no mapper, only a reducer.
  addMapReduce(new NoMapperMapReduce());
}
Aggregations