use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by cloudera.
the class GenerateSimpleLogs method run.
/**
 * Generates random log records and writes them to the {@code logs-staging}
 * dataset of the repository at {@code repo:file:/tmp/data}.
 *
 * <p>15001 records are produced (indices 0 through 15000 inclusive). Record
 * {@code i} is timestamped at {@code (now - DAY_IN_MILLIS) + i * 5000} ms, so
 * consecutive records are 5 seconds apart. The level is chosen at random from
 * {@code LOG_LEVELS} and the message is the {@code LOG_MESSAGES} entry at the
 * same index.
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception if the dataset cannot be loaded or a record cannot be written
 */
@Override
public int run(String[] args) throws Exception {
  // number of the last message to generate (the first is 0, so 15001 in total)
  final int lastMessage = 15000;
  // spacing between consecutive timestamps; long so the offset math is done in 64 bits
  final long intervalMillis = 5000L;

  // going to generate a lot of random log messages
  final Random rand = new Random();

  // open the repository
  final DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/data");

  // data is written to the staging dataset
  final Dataset<GenericRecord> staging = repo.load("logs-staging");
  final DatasetWriter<GenericRecord> writer = staging.newWriter();

  // this is going to build our simple log records
  final GenericRecordBuilder builder = new GenericRecordBuilder(staging.getDescriptor().getSchema());

  // generate timestamps 5 seconds apart, starting DAY_IN_MILLIS before now
  final Calendar now = Calendar.getInstance();
  final long yesterday = now.getTimeInMillis() - DAY_IN_MILLIS;

  try {
    writer.open();
    // 15001 messages * 5 s is a little less than 24 hours worth of messages
    for (int index = 0; index <= lastMessage; index++) {
      LOG.info("Generating log message " + index);
      builder.set("timestamp", yesterday + index * intervalMillis);
      builder.set("component", "GenerateSimpleLogs");
      int level = rand.nextInt(LOG_LEVELS.length);
      builder.set("level", LOG_LEVELS[level]);
      builder.set("message", LOG_MESSAGES[level]);
      writer.write(builder.build());
    }
  } finally {
    // flush() may itself throw (e.g. if open() failed); make sure the writer
    // is still closed in that case instead of being leaked
    try {
      writer.flush();
    } finally {
      writer.close();
    }
  }
  return 0;
}
use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by cloudera.
the class CreateUserDatasetGeneric method run.
/**
 * Creates the {@code users} dataset in the repository at
 * {@code repo:hdfs:/tmp/data}, using the Avro schema loaded from
 * {@code resource:user.avsc}, and writes 100 generated user records to it.
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception if the dataset cannot be created or written
 */
@Override
public int run(String[] args) throws Exception {
  // Open the dataset repository addressed by the repo:hdfs:/tmp/data URI
  DatasetRepository repository = DatasetRepositories.open("repo:hdfs:/tmp/data");

  // Describe the new dataset with the user Avro schema, then create it
  DatasetDescriptor userDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<GenericRecord> userDataset = repository.create("users", userDescriptor);

  // Write the generated users through a single writer
  DatasetWriter<GenericRecord> userWriter = userDataset.newWriter();
  try {
    userWriter.open();

    String[] palette = { "green", "blue", "pink", "brown", "yellow" };
    Random random = new Random();
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(userDescriptor.getSchema());

    for (int userNumber = 0; userNumber < 100; userNumber++) {
      // the same builder is reused; each field is overwritten before build()
      recordBuilder.set("username", "user-" + userNumber);
      recordBuilder.set("creationDate", System.currentTimeMillis());
      recordBuilder.set("favoriteColor", palette[random.nextInt(palette.length)]);
      GenericRecord user = recordBuilder.build();
      userWriter.write(user);
    }
  } finally {
    // always release the writer, even if a write failed
    userWriter.close();
  }
  return 0;
}
use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by cloudera.
the class CreateUserDatasetGenericParquet method run.
/**
 * Creates the {@code users} dataset in the repository at
 * {@code repo:hdfs:/tmp/data} with the Avro schema from
 * {@code resource:user.avsc} and the {@code Formats.PARQUET} format, then
 * writes 100 generated user records to it.
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception if the dataset cannot be created or written
 */
@Override
public int run(String[] args) throws Exception {
  // Open the dataset repository addressed by the repo:hdfs:/tmp/data URI
  DatasetRepository datasets = DatasetRepositories.open("repo:hdfs:/tmp/data");

  // The descriptor carries both the Avro schema and the Parquet format
  DatasetDescriptor parquetDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .format(Formats.PARQUET)
      .build();
  Dataset<GenericRecord> userDataset = datasets.create("users", parquetDescriptor);

  // Obtain a writer and push the generated users through it
  DatasetWriter<GenericRecord> out = userDataset.newWriter();
  try {
    out.open();

    String[] favoriteColors = { "green", "blue", "pink", "brown", "yellow" };
    Random generator = new Random();
    GenericRecordBuilder users = new GenericRecordBuilder(parquetDescriptor.getSchema());

    for (int n = 0; n < 100; n++) {
      // work out the field values first, then assemble the record
      String username = "user-" + n;
      long createdAt = System.currentTimeMillis();
      String color = favoriteColors[generator.nextInt(favoriteColors.length)];

      GenericRecord user = users
          .set("username", username)
          .set("creationDate", createdAt)
          .set("favoriteColor", color)
          .build();
      out.write(user);
    }
  } finally {
    // always release the writer, even if a write failed
    out.close();
  }
  return 0;
}
use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by cloudera.
the class CreateUserDatasetGenericPartitioned method run.
/**
 * Creates the {@code users} dataset in the repository at
 * {@code repo:hdfs:/tmp/data}, using the Avro schema from
 * {@code resource:user.avsc} and a partition strategy built with
 * {@code hash("username", 10)}, then writes 100 generated user records to it.
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception if the dataset cannot be created or written
 */
@Override
public int run(String[] args) throws Exception {
  // Open the dataset repository addressed by the repo:hdfs:/tmp/data URI
  DatasetRepository repository = DatasetRepositories.open("repo:hdfs:/tmp/data");

  // Hash-partition records on the username field into 10 buckets
  PartitionStrategy byUsernameHash = new PartitionStrategy.Builder()
      .hash("username", 10)
      .build();

  // Combine the user schema with the partition strategy and create the dataset
  DatasetDescriptor partitionedDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .partitionStrategy(byUsernameHash)
      .build();
  Dataset<GenericRecord> partitionedUsers = repository.create("users", partitionedDescriptor);

  // Write the generated users through a single writer
  DatasetWriter<GenericRecord> sink = partitionedUsers.newWriter();
  try {
    sink.open();

    String[] colorChoices = { "green", "blue", "pink", "brown", "yellow" };
    Random picker = new Random();
    GenericRecordBuilder userBuilder = new GenericRecordBuilder(partitionedDescriptor.getSchema());

    int written = 0;
    while (written < 100) {
      GenericRecord user = userBuilder
          .set("username", "user-" + written)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colorChoices[picker.nextInt(colorChoices.length)])
          .build();
      sink.write(user);
      written++;
    }
  } finally {
    // always release the writer, even if a write failed
    sink.close();
  }
  return 0;
}
use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by cloudera.
the class ReadProductDatasetPojo method run.
/**
 * Loads the {@code products} dataset from the repository at
 * {@code repo:hdfs:/tmp/data} and prints every {@code Product} it contains
 * to standard output, one per line.
 *
 * @param args command-line arguments; not used by this method
 * @return always 0
 * @throws Exception if the dataset cannot be loaded or read
 */
@Override
public int run(String[] args) throws Exception {
  // Open the dataset repository addressed by the repo:hdfs:/tmp/data URI
  DatasetRepository repository = DatasetRepositories.open("repo:hdfs:/tmp/data");

  // Look up the existing products dataset and get a reader over it
  Dataset<Product> productDataset = repository.load("products");
  DatasetReader<Product> productReader = productDataset.newReader();

  try {
    productReader.open();
    // print each product using its toString() representation
    for (Product item : productReader) {
      System.out.println(item);
    }
  } finally {
    // always release the reader, even if iteration failed
    productReader.close();
  }
  return 0;
}
Aggregations