
Example 1 with PartitionOutput

Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in the cdap project by caskdata.

From the class PartitionedFileSetTest, method createPartition.

// creates a file under the partition path for the given key, writes the given int into it,
// and registers the partition with the PartitionedFileSet; asserts that the file does not
// exist beforehand and exists afterward
private Location createPartition(PartitionedFileSet pfs, PartitionKey key, String fileName, int intToWrite) throws IOException {
    PartitionOutput partitionOutput = pfs.getPartitionOutput(key);
    Location outputLocation = partitionOutput.getLocation().append(fileName);
    Assert.assertFalse(outputLocation.exists());
    try (OutputStream outputStream = outputLocation.getOutputStream()) {
        outputStream.write(intToWrite);
    }
    Assert.assertTrue(outputLocation.exists());
    partitionOutput.addPartition();
    return outputLocation;
}
Also used: PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput), OutputStream (java.io.OutputStream), FileOutputStream (java.io.FileOutputStream), Location (org.apache.twill.filesystem.Location)
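
A hedged usage sketch of the helper above, assuming the test's dataset is partitioned on a string field "s" and an int field "i" (the actual partitioning of the test instance may differ):

PartitionKey key = PartitionKey.builder().addStringField("s", "a").addIntField("i", 1).build();
// creates the file, writes 42 into it, and registers the partition with the dataset
Location file = createPartition(pfs, key, "part-0", 42);
// the partition should now be retrievable
Assert.assertNotNull(pfs.getPartition(key));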

Example 2 with PartitionOutput

Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in the cdap project by caskdata.

From the class SparkFileSetTestRun, method testSparkWithPartitionedFileSet.

private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
    PartitionedFileSet pfs = pfsManager.get();
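    // create an input partition with key x="nn"; it lies within the range filter ["na", "nx") applied below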
    PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    pfsManager.flush();
    Map<String, String> inputArgs = new HashMap<>();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
    Map<String, String> outputArgs = new HashMap<>();
    PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
    args.put("input", "pfs");
    args.put("output", "pfs");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    pfsManager.flush();
    PartitionDetail partition = pfs.getPartition(outputKey);
    Assert.assertNotNull(partition);
    validateFileOutput(partition.getLocation());
    // clean up after the test completes
    pfs.dropPartition(partitionOutput.getPartitionKey());
    pfs.dropPartition(partition.getPartitionKey());
    pfsManager.flush();
}
Also used: SparkManager (io.cdap.cdap.test.SparkManager), PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput), HashMap (java.util.HashMap), PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet), PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet), PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail), Location (org.apache.twill.filesystem.Location)
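
RuntimeArguments.addScope prefixes each key with the scope and dataset name so the settings apply only to the "pfs" dataset; the filter and output-key entries coexist in one map because their underlying keys differ. A minimal sketch of the assumed behavior (the "<scope>.<name>.<key>" prefix format and the key "some.key" are illustrative assumptions, not the literal keys written by PartitionedFileSetArguments):

Map<String, String> raw = new HashMap<>();
raw.put("some.key", "some-value");
// assumed result: a new map containing "dataset.pfs.some.key" -> "some-value",
// so arguments aimed at different datasets can coexist in a single argument map
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, "pfs", raw);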

Example 3 with PartitionOutput

Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in the cdap project by caskdata.

From the class PartitionConcatenateTest, method testConcatenate.

/**
 * 1. Write 100 small files (ORC format) to a partition of a PartitionedFileSet.
 * 2. Execute a partition concatenate operation.
 * 3. Validate that the concatenate operation reduced the number of files while leaving
 *    the contents of the files unchanged.
 */
@Test
public void testConcatenate() throws Exception {
    String orcPFS = "orcPFS";
    addDatasetInstance(PartitionedFileSet.class.getName(), orcPFS, PartitionedFileSetProperties.builder()
        .setPartitioning(Partitioning.builder().addLongField("time").build())
        .setOutputFormat(OrcNewOutputFormat.class)
        .setEnableExploreOnCreate(true)
        .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
        .setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
        .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
        .setExploreSchema("record STRING")
        .build());
    // 1. create 100 small files in the input FileSet
    DataSetManager<PartitionedFileSet> cleanRecordsManager = getDataset(orcPFS);
    PartitionedFileSet cleanRecords = cleanRecordsManager.get();
    PartitionKey outputPartition = PartitionKey.builder().addLongField("time", 5000).build();
    PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
    Location partitionLocation = partitionOutput.getLocation();
    int numInputFiles = 100;
    List<String> writtenData = writeSmallOrcFiles(partitionLocation, numInputFiles);
    partitionOutput.addPartition();
    Assert.assertEquals(writtenData, getExploreResults(orcPFS));
    // this is a timestamp before concatenating, but after writing the files
    long beforeConcatTime = System.currentTimeMillis();
    List<Location> dataFiles = listFilteredChildren(partitionLocation);
    // each input file results in one output file, due to the FileInputFormat and
    // FileOutputFormat classes being used
    Assert.assertEquals(numInputFiles, dataFiles.size());
    for (Location dataFile : dataFiles) {
        // all files should have a lastModified earlier than the timestamp taken before concatenating
        Assert.assertTrue(dataFile.lastModified() < beforeConcatTime);
    }
    // 2. run the concatenate operation
    cleanRecords.concatenatePartition(outputPartition).get();
    // 3. check that the data files' lastModified timestamp is updated, and there should be fewer of them
    dataFiles = listFilteredChildren(partitionLocation);
    Assert.assertTrue(dataFiles.size() < numInputFiles);
    // the single remaining file should have a lastModified later than the pre-concatenate timestamp
    Assert.assertTrue(Iterables.getOnlyElement(dataFiles).lastModified() > beforeConcatTime);
    // even though the files were concatenated, the explore results should be unchanged
    Assert.assertEquals(writtenData, getExploreResults(orcPFS));
}
Also used: PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput), PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet), OrcNewOutputFormat (org.apache.hadoop.hive.ql.io.orc.OrcNewOutputFormat), Location (org.apache.twill.filesystem.Location), Test (org.junit.Test)
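
listFilteredChildren is a private helper of the test class that is not shown here. A hedged reconstruction of what such a helper plausibly looks like (the exact filter in the real test is an assumption): list a location's children while skipping hidden and bookkeeping entries such as Hadoop's _SUCCESS marker:

private List<Location> listFilteredChildren(Location location) throws IOException {
    List<Location> children = new ArrayList<>();
    for (Location child : location.list()) {
        // skip hidden/bookkeeping files (e.g. "_SUCCESS", ".crc") so only data files are counted
        String name = child.getName();
        if (!name.startsWith("_") && !name.startsWith(".")) {
            children.add(child);
        }
    }
    return children;
}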

Example 4 with PartitionOutput

Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in the cdap project by caskdata.

From the class DynamicPartitionerWithAvroTest, method writeFile.

private void writeFile(PartitionedFileSet pfs, PartitionKey key) throws IOException {
    PartitionOutput partitionOutput = pfs.getPartitionOutput(key);
    partitionOutput.getLocation().mkdirs();
    partitionOutput.getLocation().append("file").createNew();
    partitionOutput.setMetadata(ImmutableMap.of("file", "file"));
    partitionOutput.addPartition();
}
Also used: PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput)
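
A hedged usage sketch: calling the helper once per key seeds several partitions before a run. The field name "x" and its values are illustrative, not necessarily the test's real partitioning:

for (int i = 0; i < 3; i++) {
    // each call creates an empty marker file, attaches metadata, and registers the partition
    writeFile(pfs, PartitionKey.builder().addStringField("x", "val" + i).build());
}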

Example 5 with PartitionOutput

Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in the cdap project by caskdata.

From the class PartitionedFileSetTest, method testPartitionMetadata.

@Test
public void testPartitionMetadata() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            PartitionKey partitionKey = PartitionKey.builder().addIntField("i", 42).addLongField("l", 17L).addStringField("s", "x").build();
            ImmutableMap<String, String> metadata = ImmutableMap.of("key1", "value", "key2", "value2", "key3", "value2");
            PartitionOutput partitionOutput = dataset.getPartitionOutput(partitionKey);
            partitionOutput.setMetadata(metadata);
            partitionOutput.addPartition();
            PartitionDetail partitionDetail = dataset.getPartition(partitionKey);
            Assert.assertNotNull(partitionDetail);
            Assert.assertEquals(metadata, partitionDetail.getMetadata().asMap());
        }
    });
}
Also used: PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput), TransactionAware (org.apache.tephra.TransactionAware), PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey), PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet), TransactionExecutor (org.apache.tephra.TransactionExecutor), PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail), DataSetException (io.cdap.cdap.api.dataset.DataSetException), PartitionNotFoundException (io.cdap.cdap.api.dataset.PartitionNotFoundException), PartitionAlreadyExistsException (io.cdap.cdap.api.dataset.lib.PartitionAlreadyExistsException), IOException (java.io.IOException), ImmutableMap (com.google.common.collect.ImmutableMap), Test (org.junit.Test)
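
As a minimal follow-up sketch (continuing inside the same apply() above, using only objects already in scope), a single metadata value can be read back from the map returned by asMap():

// look up one metadata value by key; "key1" was written as "value" above
Assert.assertEquals("value", partitionDetail.getMetadata().asMap().get("key1"));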

Aggregations

PartitionOutput (io.cdap.cdap.api.dataset.lib.PartitionOutput): 10 usages
PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet): 7 usages
Test (org.junit.Test): 6 usages
DataSetException (io.cdap.cdap.api.dataset.DataSetException): 5 usages
PartitionDetail (io.cdap.cdap.api.dataset.lib.PartitionDetail): 5 usages
PartitionKey (io.cdap.cdap.api.dataset.lib.PartitionKey): 5 usages
Location (org.apache.twill.filesystem.Location): 5 usages
PartitionNotFoundException (io.cdap.cdap.api.dataset.PartitionNotFoundException): 4 usages
PartitionAlreadyExistsException (io.cdap.cdap.api.dataset.lib.PartitionAlreadyExistsException): 4 usages
IOException (java.io.IOException): 4 usages
TransactionAware (org.apache.tephra.TransactionAware): 4 usages
TransactionExecutor (org.apache.tephra.TransactionExecutor): 4 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 2 usages
TimePartitionedFileSet (io.cdap.cdap.api.dataset.lib.TimePartitionedFileSet): 2 usages
HashMap (java.util.HashMap): 2 usages
Predicate (io.cdap.cdap.api.Predicate): 1 usage
PartitionFilter (io.cdap.cdap.api.dataset.lib.PartitionFilter): 1 usage
ApplicationManager (io.cdap.cdap.test.ApplicationManager): 1 usage
MapReduceManager (io.cdap.cdap.test.MapReduceManager): 1 usage
SparkManager (io.cdap.cdap.test.SparkManager): 1 usage