Use of com.cloudera.cdk.data.DatasetRepository in the cdk-examples project by Cloudera.
Class ReadUserDatasetGenericOnePartition, method run.
/**
 * Reads every record from one partition of the "users" dataset and prints
 * each record to standard output.
 *
 * <p>The repository is opened at {@code repo:hdfs:/tmp/data}. The partition is
 * selected by asking the dataset's own partition strategy for a key built
 * from the value 0.
 *
 * @param args command-line arguments; not used by this method
 * @return 0 after all records have been printed, or 1 if no partition was
 *         returned for the key
 * @throws Exception propagated from the repository, dataset, or reader calls
 */
@Override
public int run(String[] args) throws Exception {
  // Construct a filesystem dataset repository rooted at /tmp/data
  DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");

  // Load the users dataset
  Dataset<GenericRecord> users = repo.load("users");

  // Get the partition strategy and use it to construct a partition key for
  // hash(username)=0
  PartitionStrategy partitionStrategy = users.getDescriptor().getPartitionStrategy();
  PartitionKey partitionKey = partitionStrategy.partitionKey(0);

  // Get the dataset partition for the partition key. The second argument is
  // false, so the partition is not created on demand.
  // NOTE(review): CDK is expected to return null here when the partition does
  // not exist -- confirm against the Dataset.getPartition Javadoc in use.
  Dataset<GenericRecord> partition = users.getPartition(partitionKey, false);
  if (partition == null) {
    // Fail with a clear message instead of a NullPointerException below.
    System.err.println("No partition found in dataset users for key: " + partitionKey);
    return 1;
  }

  // Get a reader for the partition and read all the users
  DatasetReader<GenericRecord> reader = partition.newReader();
  try {
    reader.open();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }
  } finally {
    // Always release the reader, even if opening or iteration fails.
    reader.close();
  }
  return 0;
}
Use of com.cloudera.cdk.data.DatasetRepository in the cdk-examples project by Cloudera.
Class LoggingServlet, method init.
/**
 * Initializes the servlet by looking up the schema of the "events" dataset
 * in the repository at {@code repo:hdfs:/tmp/data} and storing it in the
 * {@code schema} field.
 *
 * @throws ServletException declared by the overridden method
 */
@Override
public void init() throws ServletException {
  // Open the repository, load the events dataset, and keep only its schema.
  this.schema = DatasetRepositories.open("repo:hdfs:/tmp/data")
      .load("events")
      .getDescriptor()
      .getSchema();
}
Use of com.cloudera.cdk.data.DatasetRepository in the cdk-examples project by Cloudera.
Class App, method run.
/**
 * Builds ten generic Avro records against the schema of the "events" dataset
 * and sends each one to log4j, echoing it to standard output first.
 *
 * <p>Each record gets an {@code id} counting from 0 to 9 and a
 * {@code message} of the form "Hello " followed by that id.
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception propagated from the repository or record-building calls
 */
@Override
public int run(String[] args) throws Exception {
  // log4j logger that receives the events
  Logger log = Logger.getLogger(App.class);

  // Look up the events schema in the repository
  Schema eventSchema = DatasetRepositories.open("repo:hdfs:/tmp/data")
      .load("events")
      .getDescriptor()
      .getSchema();

  // One builder is reused for all records
  GenericRecordBuilder recordBuilder = new GenericRecordBuilder(eventSchema);

  long id = 0;
  while (id < 10) {
    GenericRecord record = recordBuilder
        .set("id", id)
        .set("message", "Hello " + id)
        .build();
    System.out.println("Sending to log4j: " + record);
    log.info(record);
    id++;
  }
  return 0;
}
Use of com.cloudera.cdk.data.DatasetRepository in the cdk-examples project by Cloudera.
Class ReadMovies, method run.
/**
 * Reads every record of the "movies" dataset from the repository at
 * {@code repo:hdfs://localhost:8020/user/cloudera} and prints each one to
 * standard error, prefixed with "Movie: ".
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception propagated from the repository, dataset, or reader calls
 */
@Override
public int run(String[] args) throws Exception {
  DatasetRepository repo = DatasetRepositories.open("repo:hdfs://localhost:8020/user/cloudera");

  // The record type is not needed here (records are only printed), so use
  // wildcard-parameterized types instead of raw types.
  Dataset<?> movies = repo.load("movies");
  DatasetReader<?> reader = movies.newReader();
  try {
    reader.open();
    for (Object rec : reader) {
      System.err.println("Movie: " + rec);
    }
  } finally {
    // Always release the reader, even if opening or iteration fails.
    reader.close();
  }
  return 0;
}
Use of com.cloudera.cdk.data.DatasetRepository in the cdk-examples project by Cloudera.
Class CreateStagedDataset, method run.
/**
 * Creates two datasets in the repository at {@code repo:file:/tmp/data},
 * both using the schema at {@code resource:simple-log.avsc}:
 * <ul>
 *   <li>"logs": Parquet format, partitioned by year, month, and day of the
 *       {@code timestamp} field;</li>
 *   <li>"logs-staging": Avro format, partitioned by day of the
 *       {@code timestamp} field.</li>
 * </ul>
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception propagated from the repository or descriptor-building calls
 */
@Override
public int run(String[] args) throws Exception {
  DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/data");

  // Location of the schema shared by both datasets
  URI schemaURI = URI.create("resource:simple-log.avsc");

  // Parquet dataset for long-term storage
  DatasetDescriptor logsDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.PARQUET)
      .schemaUri(schemaURI)
      .partitionStrategy(new PartitionStrategy.Builder()
          .year("timestamp", "year")
          .month("timestamp", "month")
          .day("timestamp", "day")
          .build())
      .build();
  repo.create("logs", logsDescriptor);

  // Avro dataset that temporarily holds data
  DatasetDescriptor stagingDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schemaUri(schemaURI)
      .partitionStrategy(new PartitionStrategy.Builder()
          .day("timestamp", "day")
          .build())
      .build();
  repo.create("logs-staging", stagingDescriptor);

  return 0;
}
Aggregations