Use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by Cloudera.
Class Hello, method run.
/**
 * Creates a "hellos" dataset in a local filesystem repository, writes a single
 * {@code Hello} to it, reads every record back calling {@code sayHello()} on
 * each, and finally deletes the dataset.
 *
 * @param args command-line arguments; not read by this method
 * @return 0 once every step has completed
 * @throws Exception propagated from any repository, writer, or reader call
 */
@Override
public int run(String[] args) throws Exception {
  // Local filesystem dataset repository rooted at /tmp/hello-cdk.
  final DatasetRepository repository =
      DatasetRepositories.open("repo:file:/tmp/hello-cdk");

  // The dataset's schema is taken from the Hello class.
  final DatasetDescriptor helloDescriptor = new DatasetDescriptor.Builder()
      .schema(Hello.class)
      .build();
  final Dataset<Hello> dataset = repository.create("hellos", helloDescriptor);

  // Write phase: one Hello goes into the dataset.
  final DatasetWriter<Hello> helloWriter = dataset.newWriter();
  try {
    helloWriter.open();
    helloWriter.write(new Hello("CDK"));
  } finally {
    // Close the writer whether or not the write succeeded.
    helloWriter.close();
  }

  // Read phase: greet with every Hello stored in the dataset.
  final DatasetReader<Hello> helloReader = dataset.newReader();
  try {
    helloReader.open();
    for (Hello stored : helloReader) {
      stored.sayHello();
    }
  } finally {
    helloReader.close();
  }

  // The dataset is no longer needed, so remove it from the repository.
  repository.delete("hellos");
  return 0;
}
Use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by Cloudera.
Class ReadHCatalogUserDatasetGeneric, method run.
/**
 * Loads the "users" dataset from the {@code repo:hive} repository and prints
 * every record to standard output.
 *
 * @param args command-line arguments; not read by this method
 * @return 0 once all records have been printed
 * @throws Exception propagated from any repository or reader call
 */
@Override
public int run(String[] args) throws Exception {
  // HCatalog-backed dataset repository using managed Hive tables.
  final DatasetRepository hiveRepo = DatasetRepositories.open("repo:hive");

  // Look up the existing users dataset and obtain a reader over it.
  final Dataset<GenericRecord> userDataset = hiveRepo.load("users");
  final DatasetReader<GenericRecord> userReader = userDataset.newReader();

  try {
    userReader.open();
    for (GenericRecord record : userReader) {
      System.out.println(record);
    }
  } finally {
    // Release the reader even if opening or iterating failed.
    userReader.close();
  }

  return 0;
}
Use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by Cloudera.
Class DescribeDatasets, method run.
/**
 * Creates two CSV-format datasets, "ratings" and "movies", in the repository
 * at {@code hdfs://localhost:8020/user/cloudera}. Each dataset gets an Avro
 * record schema and its own {@code cdk.csv.delimiter} property.
 *
 * @param args command-line arguments; not read by this method
 * @return 0 once both datasets have been created
 * @throws Exception propagated from any repository call
 */
@Override
public int run(String[] args) throws Exception {
  final DatasetRepository hdfsRepo =
      DatasetRepositories.open("repo:hdfs://localhost:8020/user/cloudera");

  // Ratings: four int fields, none with a default value.
  final Schema ratingSchema = SchemaBuilder.record("Rating").fields()
      .name("userId").type().intType().noDefault()
      .name("movieId").type().intType().noDefault()
      .name("rating").type().intType().noDefault()
      .name("timeInSeconds").type().intType().noDefault()
      .endRecord();

  // Ratings are described as tab-delimited CSV.
  final DatasetDescriptor ratingsDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.CSV)
      .property("cdk.csv.delimiter", "\t")
      .schema(ratingSchema)
      .build();
  hdfsRepo.create("ratings", ratingsDescriptor);

  // Column layout noted for the movies data:
  //   movie id | movie title | release date | video release date |
  //   IMDb URL | unknown | Action | Adventure | Animation |
  //   Children's | Comedy | Crime | Documentary | Drama | Fantasy |
  //   Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
  //   Thriller | War | Western |
  // The schema below declares only the first five of those columns.
  final Schema movieSchema = SchemaBuilder.record("Movie").fields()
      .name("movieId").type().intType().noDefault()
      .name("title").type().stringType().noDefault()
      .name("releaseDate").type().stringType().noDefault()
      .name("videoReleaseDate").type().stringType().noDefault()
      .name("imdbURL").type().stringType().noDefault()
      .endRecord();

  // Movies are described as pipe-delimited CSV.
  final DatasetDescriptor moviesDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.CSV)
      .property("cdk.csv.delimiter", "|")
      .schema(movieSchema)
      .build();
  hdfsRepo.create("movies", moviesDescriptor);

  return 0;
}
Use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by Cloudera.
Class StagingToPersistentSerial, method run.
/**
 * Copies yesterday's partition of the "logs-staging" dataset into the "logs"
 * dataset record by record, drops that staging partition, and then closes
 * the destination writer.
 *
 * @param args command-line arguments; not read by this method
 * @return 0 once the copy, the partition drop, and the writer close have all completed
 * @throws Exception propagated from any repository, reader, or writer call
 */
@Override
public int run(String[] args) throws Exception {
  // Repository holding both the staging and the persistent dataset.
  final DatasetRepository dataRepo = DatasetRepositories.open("repo:file:/tmp/data");

  // A point in time one day before "now"; used to select the partition to move.
  final long oneDayAgoMillis = Calendar.getInstance().getTimeInMillis() - DAY_IN_MILLIS;

  // Destination: the persistent logs dataset, with its writer opened up front.
  final Dataset<GenericRecord> logs = dataRepo.load("logs");
  final DatasetWriter<GenericRecord> logWriter = logs.newWriter();
  logWriter.open();
  // NOTE(review): if anything below throws, logWriter is never closed. That
  // appears intentional (close is treated as the commit step), but confirm.

  // Source: yesterday's partition of the staging dataset.
  final Dataset<GenericRecord> logsStaging = dataRepo.load("logs-staging");
  final PartitionKey yesterdayKey = getPartitionKey(logsStaging, oneDayAgoMillis);
  final DatasetReader<GenericRecord> stagedReader =
      logsStaging.getPartition(yesterdayKey, false).newReader();

  try {
    stagedReader.open();
    // Stream every staged record straight into the destination writer.
    for (GenericRecord staged : stagedReader) {
      logWriter.write(staged);
    }
  } finally {
    stagedReader.close();
    logWriter.flush();
  }

  // Remove the source partition from staging.
  // NOTE(review): the partition is dropped before the writer is closed; if
  // close() fails after this point the source copy is already gone. Confirm
  // this ordering is the intended trade-off.
  logsStaging.dropPartition(yesterdayKey);

  // Nothing above threw, so commit the copied data by closing the writer.
  logWriter.close();
  return 0;
}
Use of com.cloudera.cdk.data.DatasetRepository in project cdk-examples by Cloudera.
Class ReadUserDatasetGeneric, method run.
/**
 * Loads the "users" dataset from the {@code repo:hdfs:/tmp/data} repository
 * and prints every record to standard output.
 *
 * @param args command-line arguments; not read by this method
 * @return 0 once all records have been printed
 * @throws Exception propagated from any repository or reader call
 */
@Override
public int run(String[] args) throws Exception {
  // Filesystem dataset repository rooted at /tmp/data.
  final DatasetRepository dataRepo = DatasetRepositories.open("repo:hdfs:/tmp/data");

  // Fetch the users dataset, then a reader that iterates over its records.
  final Dataset<GenericRecord> userDataset = dataRepo.load("users");
  final DatasetReader<GenericRecord> userReader = userDataset.newReader();

  try {
    userReader.open();
    for (GenericRecord record : userReader) {
      System.out.println(record);
    }
  } finally {
    // Always close the reader, including when open() or iteration throws.
    userReader.close();
  }

  return 0;
}
Aggregations