Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class PartitionedFileSetTest, method testRollbackOfPartitionDelete.
@Test
public void testRollbackOfPartitionDelete() throws Exception {
  PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsInstance);
  TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
  txContext.start();
  // write 1 to the first file
  Location outputLocation = createPartition(pfs, PARTITION_KEY, "file", 1);
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(pfs.getPartition(PARTITION_KEY).getLocation().exists());
  txContext.finish();

  txContext.start();
  pfs.dropPartition(PARTITION_KEY);
  Assert.assertNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertFalse(outputLocation.exists());
  // create a new partition with the same partition key (same relative path for the partition)
  // and write 2 to its file
  Location outputLocation2 = createPartition(pfs, PARTITION_KEY, "file", 2);
  Assert.assertTrue(outputLocation2.exists());
  txContext.abort();

  // since the previous transaction aborted, the original partition and its files should still exist
  txContext.start();
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(outputLocation.exists());
  try (InputStream inputStream = outputLocation.getInputStream()) {
    // should be 1, written by the first partition, not 2 (which was written by the second partition)
    Assert.assertEquals(1, inputStream.read());
    // there should be nothing else in the file
    Assert.assertEquals(0, inputStream.available());
  }
  txContext.finish();
}
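The createPartition helper used above is local to the test class and not shown in this excerpt. Below is a minimal sketch of what such a helper could look like, assuming the standard PartitionOutput API and that the content argument is written as a single byte; the name and signature are taken from the call sites, everything else is an assumption (imports omitted, as in the surrounding snippets):

private Location createPartition(PartitionedFileSet pfs, PartitionKey key, String fileName, int content)
    throws Exception {
  // hypothetical reconstruction: obtain a PartitionOutput for the key, write one byte
  // of content to a file inside the partition directory, then register the partition
  PartitionOutput output = pfs.getPartitionOutput(key);
  Location location = output.getLocation().append(fileName);
  try (OutputStream outputStream = location.getOutputStream()) {
    outputStream.write(content);  // the test reads this back with inputStream.read()
  }
  output.addPartition();  // commits the partition metadata for this key
  return location;
}

This matches the assertions in the test: the returned Location is the file itself, and rolling back a drop of the partition must leave it intact.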
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class SportResultsTest, method testPartitionedCounting.
@Test
public void testPartitionedCounting() throws Exception {
  // deploy the application and start the upload service
  ApplicationManager appManager = deployApplication(SportResults.class);
  ServiceManager serviceManager = appManager.getServiceManager("UploadService").start();
  serviceManager.waitForStatus(true);

  // upload a few dummy results
  URL url = serviceManager.getServiceURL();
  uploadResults(url, "fantasy", 2014, FANTASY_2014);
  uploadResults(url, "fantasy", 2015, FANTASY_2015);
  uploadResults(url, "critters", 2014, CRITTERS_2014);

  // start a map/reduce that counts all seasons for the fantasy league
  MapReduceManager mrManager = appManager.getMapReduceManager("ScoreCounter")
    .start(ImmutableMap.of("league", "fantasy"));
  // allow up to five minutes for the run to complete; it should be much faster, though
  mrManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // validate the output by reading directly from the file set
  DataSetManager<PartitionedFileSet> dataSetManager = getDataset("totals");
  PartitionedFileSet totals = dataSetManager.get();
  PartitionDetail partitionDetail =
    totals.getPartition(PartitionKey.builder().addStringField("league", "fantasy").build());
  Assert.assertNotNull(partitionDetail);
  Location location = partitionDetail.getLocation();

  // find the part file that has the actual results
  Assert.assertTrue(location.isDirectory());
  for (Location file : location.list()) {
    if (file.getName().startsWith("part")) {
      location = file;
    }
  }

  // validate each line of the output file
  Map<String, String[]> expected = ImmutableMap.of(
    "My Team", new String[] { "My Team", "2", "0", "1", "53", "65" },
    "Your Team", new String[] { "Your Team", "1", "0", "2", "63", "60" },
    "Other Team", new String[] { "Other Team", "1", "0", "1", "40", "31" });
  try (BufferedReader reader =
         new BufferedReader(new InputStreamReader(location.getInputStream(), "UTF-8"))) {
    String line;
    while ((line = reader.readLine()) != null) {
      String[] fields = line.split(",");
      Assert.assertArrayEquals(expected.get(fields[0]), fields);
    }
  }

  // verify the same results with a SQL query
  Connection connection = getQueryClient();
  ResultSet results = connection.prepareStatement(
    "SELECT wins, ties, losses, scored, conceded " +
    "FROM totals WHERE team = 'My Team' AND league = 'fantasy'").executeQuery();
  // should return exactly one row, with the correct fields
  Assert.assertTrue(results.next());
  Assert.assertEquals(2, results.getInt(1));
  Assert.assertEquals(0, results.getInt(2));
  Assert.assertEquals(1, results.getInt(3));
  Assert.assertEquals(53, results.getInt(4));
  Assert.assertEquals(65, results.getInt(5));
  Assert.assertFalse(results.next());
}
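The uploadResults helper is also not shown in this excerpt. Here is a minimal sketch under the assumption that the UploadService accepts the raw CSV body via a PUT to a path of the form leagues/{league}/seasons/{season}; the path, helper signature, and status-code check are all assumptions:

private void uploadResults(URL serviceUrl, String league, int season, String content) throws IOException {
  // hypothetical: PUT the CSV payload for one league/season to the upload service
  URL url = new URL(serviceUrl, String.format("leagues/%s/seasons/%d", league, season));
  HttpURLConnection connection = (HttpURLConnection) url.openConnection();
  try {
    connection.setDoOutput(true);
    connection.setRequestMethod("PUT");
    connection.getOutputStream().write(content.getBytes(StandardCharsets.UTF_8));
    // a 200 response means the service accepted and stored the upload
    Assert.assertEquals(200, connection.getResponseCode());
  } finally {
    connection.disconnect();
  }
}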
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class ScoreCounter, method initialize.
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Job job = context.getHadoopJob();
  job.setMapperClass(ResultsMapper.class);
  job.setReducerClass(TeamCounter.class);
  job.setNumReduceTasks(1);
  String league = context.getRuntimeArguments().get("league");
  Preconditions.checkNotNull(league);

  // configure the input to read all seasons for the league
  Map<String, String> inputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addValueCondition("league", league).build());
  context.addInput(Input.ofDataset("results", inputArgs));

  // each run writes its output to a partition for the league
  Map<String, String> outputArgs = Maps.newHashMap();
  PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  context.addOutput(Output.ofDataset("totals", outputArgs));

  // used only for logging:
  PartitionedFileSet input = context.getDataset("results", inputArgs);
  PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
  String outputPath = FileSetArguments.getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
  LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
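ResultsMapper and TeamCounter are not shown in this excerpt. As a sketch of the reducer half, under the assumptions that the mapper emits one (team, "pointsFor:pointsAgainst") pair per game and that the output columns match what the SportResultsTest above queries; the value encoding and field layout are assumptions:

public static class TeamCounter extends Reducer<Text, Text, Text, NullWritable> {
  @Override
  protected void reduce(Text team, Iterable<Text> games, Context context)
      throws IOException, InterruptedException {
    int wins = 0, ties = 0, losses = 0, scored = 0, conceded = 0;
    for (Text game : games) {
      // hypothetical value format: "pointsFor:pointsAgainst" for one game
      String[] points = game.toString().split(":");
      int pointsFor = Integer.parseInt(points[0]);
      int pointsAgainst = Integer.parseInt(points[1]);
      scored += pointsFor;
      conceded += pointsAgainst;
      if (pointsFor > pointsAgainst) {
        wins++;
      } else if (pointsFor == pointsAgainst) {
        ties++;
      } else {
        losses++;
      }
    }
    // one CSV line per team: team,wins,ties,losses,scored,conceded
    context.write(new Text(String.format("%s,%d,%d,%d,%d,%d",
      team, wins, ties, losses, scored, conceded)), NullWritable.get());
  }
}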
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class HiveExploreServiceFileSetTestRun, method testPartitionedAvroSchemaUpdate.
@Test
public void testPartitionedAvroSchemaUpdate() throws Exception {
  final DatasetId datasetId = NAMESPACE_ID.dataset("avroupd");
  final String tableName = getDatasetHiveName(datasetId);
  // create a partitioned file set, partitioned by the int field "number"
  datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", SCHEMA.toString())
      .build());
  // access the dataset instance to perform data operations
  PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  FileSet fileSet = partitioned.getEmbeddedFileSet();
  // add a partition
  Location location4 = fileSet.getLocation("file4/nn");
  FileWriterHelper.generateAvroFile(location4.getOutputStream(), "x", 4, 5);
  addPartition(partitioned, PartitionKey.builder().addIntField("number", 4).build(), "file4");
  // validate the new partition with a query; the avro file has key=x4, value=#4
  List<ColumnDesc> expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".key", "STRING", 1, null),
    new ColumnDesc(tableName + ".value", "STRING", 2, null),
    new ColumnDesc(tableName + ".number", "INT", 3, null));
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns,
    Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x4", "#4", 4))));
  // update the partitioned file set with a schema that no longer has the "value" column
  datasetFramework.updateInstance(datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", K_SCHEMA.toString())
      .build());
  // the avro file still has key=x4, value=#4, but only key and number are exposed now
  expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".key", "STRING", 1, null),
    new ColumnDesc(tableName + ".number", "INT", 2, null));
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns,
    Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x4", 4))));
}
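The addPartition helper here is test-local and not shown. A minimal sketch, assuming it registers the already-written directory as a partition inside a transaction; the executor wiring and the transactionSystemClient field are assumptions:

private void addPartition(final PartitionedFileSet partitioned, final PartitionKey key, final String path)
    throws Exception {
  // PartitionedFileSet metadata operations must run inside a transaction
  TransactionExecutor executor =
    new DefaultTransactionExecutor(transactionSystemClient, (TransactionAware) partitioned);
  executor.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // maps the partition key to the given path, relative to the embedded file set
      partitioned.addPartition(key, path);
    }
  });
}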
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class ExploreExecutorHttpHandler, method doPartitionOperation.
private void doPartitionOperation(FullHttpRequest request, HttpResponder responder, DatasetId datasetId,
                                  PartitionOperation partitionOperation) {
  try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
    Dataset dataset;
    try {
      dataset = datasetInstantiator.getDataset(datasetId);
    } catch (Exception e) {
      LOG.error("Exception instantiating dataset {}.", datasetId, e);
      responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR,
                           "Exception instantiating dataset " + datasetId);
      return;
    }
    try {
      if (!(dataset instanceof PartitionedFileSet)) {
        responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
        return;
      }
      Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
      Reader reader = new InputStreamReader(new ByteBufInputStream(request.content()));
      Map<String, String> properties =
        GSON.fromJson(reader, new TypeToken<Map<String, String>>() { }.getType());
      PartitionKey partitionKey;
      try {
        partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
      } catch (Exception e) {
        responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
        return;
      }
      if (partitionKey == null) {
        responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
        return;
      }
      QueryHandle handle = partitionOperation.submitOperation(partitionKey, properties);
      if (handle == null) {
        return;
      }
      JsonObject json = new JsonObject();
      json.addProperty("handle", handle.getHandle());
      responder.sendJson(HttpResponseStatus.OK, json.toString());
    } finally {
      Closeables.closeQuietly(dataset);
    }
  } catch (Throwable e) {
    LOG.error("Got exception:", e);
    responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
  }
}
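For context, a caller of this handler sends the partition key as a JSON map of output-partition properties in the request body and receives back a JSON object containing the query handle. A rough client-side sketch follows; the endpoint URL, port, and property-key prefix are assumptions, not confirmed by this excerpt:

// hypothetical endpoint; adjust host, port, and path to the actual deployment
URL url = new URL("http://localhost:11015/v3/namespaces/default/data/explore/datasets/totals/partitions");
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.setDoOutput(true);
connection.setRequestMethod("POST");
connection.setRequestProperty("Content-Type", "application/json");
// assumed property name: the handler resolves keys of the form
// "output.partition.key.<field>" into a PartitionKey via getOutputPartitionKey
String body = "{\"output.partition.key.league\": \"fantasy\"}";
connection.getOutputStream().write(body.getBytes(StandardCharsets.UTF_8));
if (connection.getResponseCode() == 200) {
  // on success the handler responds with a JSON object such as {"handle": "..."}
  try (Reader reader = new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8)) {
    System.out.println(CharStreams.toString(reader));
  }
}
connection.disconnect();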