Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class PartitionedFileSetTest, method testRollbackOfPartitionDelete.
@Test
public void testRollbackOfPartitionDelete() throws Exception {
  PartitionedFileSet pfs = dsFrameworkUtil.getInstance(pfsInstance);
  TransactionContext txContext = new TransactionContext(txClient, (TransactionAware) pfs);
  txContext.start();
  // write 1 to the first file
  Location outputLocation = createPartition(pfs, PARTITION_KEY, "file", 1);
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(pfs.getPartition(PARTITION_KEY).getLocation().exists());
  txContext.finish();

  txContext.start();
  pfs.dropPartition(PARTITION_KEY);
  Assert.assertNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertFalse(outputLocation.exists());
  // create a new partition with the same partition key (same relative path for the partition)
  // and write 2 to its file
  Location outputLocation2 = createPartition(pfs, PARTITION_KEY, "file", 2);
  Assert.assertTrue(outputLocation2.exists());
  txContext.abort();

  // since the previous transaction aborted, the original partition and its files should still exist
  txContext.start();
  Assert.assertNotNull(pfs.getPartition(PARTITION_KEY));
  Assert.assertTrue(outputLocation.exists());
  try (InputStream inputStream = outputLocation.getInputStream()) {
    // should be 1, written by the first partition, not 2 (which was written by the second partition)
    Assert.assertEquals(1, inputStream.read());
    // there should be nothing else in the file
    Assert.assertEquals(0, inputStream.available());
  }
  txContext.finish();
}
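The createPartition helper used above is local to the test class and not shown in this excerpt. Below is a minimal sketch of what such a helper could look like, assuming the standard PartitionOutput API and that the content argument is written as a single byte; the name and signature are taken from the call sites, everything else is an assumption (imports omitted, as in the surrounding snippets):

private Location createPartition(PartitionedFileSet pfs, PartitionKey key, String fileName, int content)
    throws Exception {
  // hypothetical reconstruction: obtain a PartitionOutput for the key, write one byte
  // of content to a file inside the partition directory, then register the partition
  PartitionOutput output = pfs.getPartitionOutput(key);
  Location location = output.getLocation().append(fileName);
  try (OutputStream outputStream = location.getOutputStream()) {
    outputStream.write(content);  // the test reads this back with inputStream.read()
  }
  output.addPartition();  // commits the partition metadata for this key
  return location;
}

This matches the assertions in the test: the returned Location is the file itself, and rolling back a drop of the partition must leave it intact.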
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class SportResultsTest, method testPartitionedCounting.
@Test
public void testPartitionedCounting() throws Exception {
  // deploy the application and start the upload service
  ApplicationManager appManager = deployApplication(SportResults.class);
  ServiceManager serviceManager = appManager.getServiceManager("UploadService").start();
  serviceManager.waitForStatus(true);

  // upload a few dummy results
  URL url = serviceManager.getServiceURL();
  uploadResults(url, "fantasy", 2014, FANTASY_2014);
  uploadResults(url, "fantasy", 2015, FANTASY_2015);
  uploadResults(url, "critters", 2014, CRITTERS_2014);

  // start a map/reduce that counts all seasons for the fantasy league
  MapReduceManager mrManager = appManager.getMapReduceManager("ScoreCounter")
    .start(ImmutableMap.of("league", "fantasy"));
  // allow up to five minutes for the run to complete; it should be much faster, though
  mrManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  // validate the output by reading directly from the file set
  DataSetManager<PartitionedFileSet> dataSetManager = getDataset("totals");
  PartitionedFileSet totals = dataSetManager.get();
  PartitionDetail partitionDetail =
    totals.getPartition(PartitionKey.builder().addStringField("league", "fantasy").build());
  Assert.assertNotNull(partitionDetail);
  Location location = partitionDetail.getLocation();

  // find the part file that has the actual results
  Assert.assertTrue(location.isDirectory());
  for (Location file : location.list()) {
    if (file.getName().startsWith("part")) {
      location = file;
    }
  }

  // validate each line of the output file
  Map<String, String[]> expected = ImmutableMap.of(
    "My Team", new String[] { "My Team", "2", "0", "1", "53", "65" },
    "Your Team", new String[] { "Your Team", "1", "0", "2", "63", "60" },
    "Other Team", new String[] { "Other Team", "1", "0", "1", "40", "31" });
  try (BufferedReader reader =
         new BufferedReader(new InputStreamReader(location.getInputStream(), "UTF-8"))) {
    String line;
    while ((line = reader.readLine()) != null) {
      String[] fields = line.split(",");
      Assert.assertArrayEquals(expected.get(fields[0]), fields);
    }
  }

  // verify the same results with a SQL query
  Connection connection = getQueryClient();
  ResultSet results = connection.prepareStatement(
    "SELECT wins, ties, losses, scored, conceded " +
    "FROM totals WHERE team = 'My Team' AND league = 'fantasy'").executeQuery();
  // should return exactly one row, with the correct fields
  Assert.assertTrue(results.next());
  Assert.assertEquals(2, results.getInt(1));
  Assert.assertEquals(0, results.getInt(2));
  Assert.assertEquals(1, results.getInt(3));
  Assert.assertEquals(53, results.getInt(4));
  Assert.assertEquals(65, results.getInt(5));
  Assert.assertFalse(results.next());
}
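The uploadResults helper is also not shown in this excerpt. Here is a minimal sketch under the assumption that the UploadService accepts the raw CSV body via a PUT to a path of the form leagues/{league}/seasons/{season}; the path, helper signature, and status-code check are all assumptions:

private void uploadResults(URL serviceUrl, String league, int season, String content) throws IOException {
  // hypothetical: PUT the CSV payload for one league/season to the upload service
  URL url = new URL(serviceUrl, String.format("leagues/%s/seasons/%d", league, season));
  HttpURLConnection connection = (HttpURLConnection) url.openConnection();
  try {
    connection.setDoOutput(true);
    connection.setRequestMethod("PUT");
    connection.getOutputStream().write(content.getBytes(StandardCharsets.UTF_8));
    // a 200 response means the service accepted and stored the upload
    Assert.assertEquals(200, connection.getResponseCode());
  } finally {
    connection.disconnect();
  }
}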
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class ScoreCounter, method initialize.
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Job job = context.getHadoopJob();
  job.setMapperClass(ResultsMapper.class);
  job.setReducerClass(TeamCounter.class);
  job.setNumReduceTasks(1);
  String league = context.getRuntimeArguments().get("league");
  Preconditions.checkNotNull(league);

  // configure the input to read all seasons for the league
  Map<String, String> inputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addValueCondition("league", league).build());
  context.addInput(Input.ofDataset("results", inputArgs));

  // each run writes its output to a partition for the league
  Map<String, String> outputArgs = Maps.newHashMap();
  PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  context.addOutput(Output.ofDataset("totals", outputArgs));

  // used only for logging:
  PartitionedFileSet input = context.getDataset("results", inputArgs);
  PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
  String outputPath = FileSetArguments.getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
  LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
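ResultsMapper and TeamCounter are not shown in this excerpt. As a sketch of the reducer half, under the assumptions that the mapper emits one (team, "pointsFor:pointsAgainst") pair per game and that the output columns match what the SportResultsTest above queries; the value encoding and field layout are assumptions:

public static class TeamCounter extends Reducer<Text, Text, Text, NullWritable> {
  @Override
  protected void reduce(Text team, Iterable<Text> games, Context context)
      throws IOException, InterruptedException {
    int wins = 0, ties = 0, losses = 0, scored = 0, conceded = 0;
    for (Text game : games) {
      // hypothetical value format: "pointsFor:pointsAgainst" for one game
      String[] points = game.toString().split(":");
      int pointsFor = Integer.parseInt(points[0]);
      int pointsAgainst = Integer.parseInt(points[1]);
      scored += pointsFor;
      conceded += pointsAgainst;
      if (pointsFor > pointsAgainst) {
        wins++;
      } else if (pointsFor == pointsAgainst) {
        ties++;
      } else {
        losses++;
      }
    }
    // one CSV line per team: team,wins,ties,losses,scored,conceded
    context.write(new Text(String.format("%s,%d,%d,%d,%d,%d",
      team, wins, ties, losses, scored, conceded)), NullWritable.get());
  }
}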
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class HiveExploreServiceFileSetTestRun, method testPartitionedAvroSchemaUpdate.
@Test
public void testPartitionedAvroSchemaUpdate() throws Exception {
  final DatasetId datasetId = NAMESPACE_ID.dataset("avroupd");
  final String tableName = getDatasetHiveName(datasetId);
  // create a partitioned file set, partitioned by the int field "number"
  datasetFramework.addInstance(PartitionedFileSet.class.getName(), datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", SCHEMA.toString())
      .build());
  // access the dataset instance to perform data operations
  PartitionedFileSet partitioned = datasetFramework.getDataset(datasetId, DatasetDefinition.NO_ARGUMENTS, null);
  Assert.assertNotNull(partitioned);
  FileSet fileSet = partitioned.getEmbeddedFileSet();
  // add a partition
  Location location4 = fileSet.getLocation("file4/nn");
  FileWriterHelper.generateAvroFile(location4.getOutputStream(), "x", 4, 5);
  addPartition(partitioned, PartitionKey.builder().addIntField("number", 4).build(), "file4");
  // validate the new partition with a query; the avro file has key=x4, value=#4
  List<ColumnDesc> expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".key", "STRING", 1, null),
    new ColumnDesc(tableName + ".value", "STRING", 2, null),
    new ColumnDesc(tableName + ".number", "INT", 3, null));
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns,
    Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x4", "#4", 4))));
  // update the partitioned file set with a schema that no longer has the "value" column
  datasetFramework.updateInstance(datasetId,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addIntField("number").build())
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
      .setTableProperty("avro.schema.literal", K_SCHEMA.toString())
      .build());
  // the avro file still has key=x4, value=#4, but only key and number are exposed now
  expectedColumns = Lists.newArrayList(
    new ColumnDesc(tableName + ".key", "STRING", 1, null),
    new ColumnDesc(tableName + ".number", "INT", 2, null));
  runCommand(NAMESPACE_ID, "SELECT * FROM " + tableName + " WHERE number=4", true, expectedColumns,
    Lists.newArrayList(new QueryResult(Lists.<Object>newArrayList("x4", 4))));
}
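The addPartition helper here is test-local and not shown. A minimal sketch, assuming it registers the already-written directory as a partition inside a transaction; the executor wiring and the transactionSystemClient field are assumptions:

private void addPartition(final PartitionedFileSet partitioned, final PartitionKey key, final String path)
    throws Exception {
  // PartitionedFileSet metadata operations must run inside a transaction
  TransactionExecutor executor =
    new DefaultTransactionExecutor(transactionSystemClient, (TransactionAware) partitioned);
  executor.execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      // maps the partition key to the given path, relative to the embedded file set
      partitioned.addPartition(key, path);
    }
  });
}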
Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
From the class ExploreExecutorHttpHandler, method doPartitionOperation.
private void doPartitionOperation(FullHttpRequest request, HttpResponder responder, DatasetId datasetId,
                                  PartitionOperation partitionOperation) {
  try (SystemDatasetInstantiator datasetInstantiator = datasetInstantiatorFactory.create()) {
    Dataset dataset;
    try {
      dataset = datasetInstantiator.getDataset(datasetId);
    } catch (Exception e) {
      LOG.error("Exception instantiating dataset {}.", datasetId, e);
      responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR,
                           "Exception instantiating dataset " + datasetId);
      return;
    }
    try {
      if (!(dataset instanceof PartitionedFileSet)) {
        responder.sendString(HttpResponseStatus.BAD_REQUEST, "not a partitioned dataset.");
        return;
      }
      Partitioning partitioning = ((PartitionedFileSet) dataset).getPartitioning();
      Reader reader = new InputStreamReader(new ByteBufInputStream(request.content()));
      Map<String, String> properties =
        GSON.fromJson(reader, new TypeToken<Map<String, String>>() { }.getType());
      PartitionKey partitionKey;
      try {
        partitionKey = PartitionedFileSetArguments.getOutputPartitionKey(properties, partitioning);
      } catch (Exception e) {
        responder.sendString(HttpResponseStatus.BAD_REQUEST, "invalid partition key: " + e.getMessage());
        return;
      }
      if (partitionKey == null) {
        responder.sendString(HttpResponseStatus.BAD_REQUEST, "no partition key was given.");
        return;
      }
      QueryHandle handle = partitionOperation.submitOperation(partitionKey, properties);
      if (handle == null) {
        return;
      }
      JsonObject json = new JsonObject();
      json.addProperty("handle", handle.getHandle());
      responder.sendJson(HttpResponseStatus.OK, json.toString());
    } finally {
      Closeables.closeQuietly(dataset);
    }
  } catch (Throwable e) {
    LOG.error("Got exception:", e);
    responder.sendString(HttpResponseStatus.INTERNAL_SERVER_ERROR, e.getMessage());
  }
}
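For context, a caller of this handler sends the partition key as a JSON map of output-partition properties in the request body and receives back a JSON object containing the query handle. A rough client-side sketch follows; the endpoint URL, port, and property-key prefix are assumptions, not confirmed by this excerpt:

// hypothetical endpoint; adjust host, port, and path to the actual deployment
URL url = new URL("http://localhost:11015/v3/namespaces/default/data/explore/datasets/totals/partitions");
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.setDoOutput(true);
connection.setRequestMethod("POST");
connection.setRequestProperty("Content-Type", "application/json");
// assumed property name: the handler resolves keys of the form
// "output.partition.key.<field>" into a PartitionKey via getOutputPartitionKey
String body = "{\"output.partition.key.league\": \"fantasy\"}";
connection.getOutputStream().write(body.getBytes(StandardCharsets.UTF_8));
if (connection.getResponseCode() == 200) {
  // on success the handler responds with a JSON object such as {"handle": "..."}
  try (Reader reader = new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8)) {
    System.out.println(CharStreams.toString(reader));
  }
}
connection.disconnect();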