Use of org.apache.avro.generic.GenericRecord in project cdk-examples by Cloudera.
The run method of the class CreateHCatalogUserDatasetGeneric.
@Override
public int run(String[] args) throws Exception {
  // Construct an HCatalog dataset repository using managed Hive tables
  DatasetRepository repo = DatasetRepositories.open("repo:hive");
  // Create a dataset of users with the Avro schema in the repository
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<GenericRecord> users = repo.create("users", descriptor);
  // Get a writer for the dataset and write some users to it
  DatasetWriter<GenericRecord> writer = users.newWriter();
  try {
    writer.open();
    String[] colors = { "green", "blue", "pink", "brown", "yellow" };
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      GenericRecord record = builder
          .set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)])
          .build();
      writer.write(record);
    }
  } finally {
    writer.close();
  }
  return 0;
}
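The user.avsc file itself is not shown here. Judging from the fields the writer sets, it presumably declares a record with a string username, a long creationDate, and a string favoriteColor. The following sketch builds an equivalent schema programmatically with Avro's SchemaBuilder; the record name and namespace are assumptions, and only the field names come from the code above.

// Requires org.apache.avro.Schema and org.apache.avro.SchemaBuilder.
// The record name "User" and the namespace are assumed; only the three
// field names are taken from the writer code above.
Schema userSchema = SchemaBuilder.record("User").namespace("com.example")
    .fields()
    .requiredString("username")
    .requiredLong("creationDate")
    .requiredString("favoriteColor")
    .endRecord();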
Use of org.apache.avro.generic.GenericRecord in project cdk-examples by Cloudera.
The run method of the class ReadHCatalogUserDatasetGeneric.
@Override
public int run(String[] args) throws Exception {
  // Construct an HCatalog dataset repository using managed Hive tables
  DatasetRepository repo = DatasetRepositories.open("repo:hive");
  // Load the users dataset
  Dataset<GenericRecord> users = repo.load("users");
  // Get a reader for the dataset and read all the users
  DatasetReader<GenericRecord> reader = users.newReader();
  try {
    reader.open();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }
  } finally {
    reader.close();
  }
  return 0;
}
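The run(String[]) signature with @Override suggests these examples implement Hadoop's Tool interface. If so, a launcher along these lines would drive them; this is a hedged sketch, not code taken from the project.

// Sketch of a launcher for the Tool above; ToolRunner parses generic
// Hadoop options and then invokes run(args).
// Requires org.apache.hadoop.util.ToolRunner.
public static void main(String[] args) throws Exception {
  int rc = ToolRunner.run(new ReadHCatalogUserDatasetGeneric(), args);
  System.exit(rc);
}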
Use of org.apache.avro.generic.GenericRecord in project cdk-examples by Cloudera.
The run method of the class StagingToPersistentSerial.
@Override
public int run(String[] args) throws Exception {
  // open the repository
  final DatasetRepository repo = DatasetRepositories.open("repo:file:/tmp/data");
  final Calendar now = Calendar.getInstance();
  final long yesterdayTimestamp = now.getTimeInMillis() - DAY_IN_MILLIS;
  // the destination dataset
  final Dataset<GenericRecord> persistent = repo.load("logs");
  final DatasetWriter<GenericRecord> writer = persistent.newWriter();
  writer.open();
  // the source dataset: yesterday's partition in the staging area
  final Dataset<GenericRecord> staging = repo.load("logs-staging");
  final PartitionKey yesterday = getPartitionKey(staging, yesterdayTimestamp);
  final DatasetReader<GenericRecord> reader =
      staging.getPartition(yesterday, false).newReader();
  try {
    reader.open();
    // copy each record from the staging partition to the persistent dataset
    for (GenericRecord record : reader) {
      writer.write(record);
    }
  } finally {
    reader.close();
    writer.flush();
  }
  // remove the source data partition from staging
  staging.dropPartition(yesterday);
  // if the above didn't throw an exception, commit the data
  writer.close();
  return 0;
}
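The getPartitionKey helper is not part of this snippet. In the CDK data API a PartitionKey can be derived from a record through the dataset's PartitionStrategy, so a helper along the following lines would work. This is a sketch under assumptions: the logs schema is taken to have a long timestamp field plus level and message fields, and the placeholder values exist only to satisfy the record builder.

// Sketch of the missing helper: build a throwaway record carrying the
// timestamp, then ask the partition strategy for the matching key.
// The "level" and "message" fields are assumptions about the logs schema.
private PartitionKey getPartitionKey(Dataset<GenericRecord> data, long timestamp) {
  GenericRecord record = new GenericRecordBuilder(data.getDescriptor().getSchema())
      .set("timestamp", timestamp)
      .set("level", "INFO")
      .set("message", "placeholder")
      .build();
  return data.getDescriptor().getPartitionStrategy().partitionKeyForEntity(record);
}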
Use of org.apache.avro.generic.GenericRecord in project cdk-examples by Cloudera.
The run method of the class ReadUserDatasetGeneric.
@Override
public int run(String[] args) throws Exception {
  // Construct a filesystem dataset repository rooted at /tmp/data
  DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");
  // Load the users dataset
  Dataset<GenericRecord> users = repo.load("users");
  // Get a reader for the dataset and read all the users
  DatasetReader<GenericRecord> reader = users.newReader();
  try {
    reader.open();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }
  } finally {
    reader.close();
  }
  return 0;
}
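Because the records are Avro generics, individual fields can be read with GenericRecord.get rather than printing the record's default toString() form. A minimal variation of the read loop, with field names taken from the create example above:

// Pull individual fields out of each generic record.
for (GenericRecord user : reader) {
  System.out.println(user.get("username") + " likes " + user.get("favoriteColor"));
}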
Use of org.apache.avro.generic.GenericRecord in project cdk-examples by Cloudera.
The run method of the class ReadUserDatasetGenericOnePartition.
@Override
public int run(String[] args) throws Exception {
  // Construct a filesystem dataset repository rooted at /tmp/data
  DatasetRepository repo = DatasetRepositories.open("repo:hdfs:/tmp/data");
  // Load the users dataset
  Dataset<GenericRecord> users = repo.load("users");
  // Get the partition strategy and use it to construct a partition key for
  // hash(username)=0
  PartitionStrategy partitionStrategy = users.getDescriptor().getPartitionStrategy();
  PartitionKey partitionKey = partitionStrategy.partitionKey(0);
  // Get the dataset partition for the partition key
  Dataset<GenericRecord> partition = users.getPartition(partitionKey, false);
  // Get a reader for the partition and read all the users
  DatasetReader<GenericRecord> reader = partition.newReader();
  try {
    reader.open();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }
  } finally {
    reader.close();
  }
  return 0;
}
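For partitionStrategy.partitionKey(0) to address the hash(username)=0 bucket, the users dataset must have been created with a hash partition strategy. A sketch of what that descriptor presumably looked like; the bucket count of 10 is an assumption.

// Sketch of the descriptor the users dataset was presumably created with;
// the number of hash buckets (10) is an assumption.
PartitionStrategy strategy = new PartitionStrategy.Builder()
    .hash("username", 10)
    .build();
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaUri("resource:user.avsc")
    .partitionStrategy(strategy)
    .build();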