Search in sources :

Example 1 with PartitionOutput

use of co.cask.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.

the class PartitionRollbackTestRun method testPFSRollback.

/*
   * This tests all the following cases:
   *
   *  1. addPartition(location) fails because partition already exists
   *  2. addPartition(location) fails because Hive partition already exists
   *  3. addPartition(location) succeeds but transaction fails
   *  4. getPartitionOutput() fails because partition already exists
   *  5. partitionOutput.addPartition() fails because Hive partition already exists
   *  6. partitionOutput.addPartition() succeeds but transaction fails
   *  7. mapreduce writing partition fails because location already exists
   *  8. mapreduce writing partition fails because partition already exists
   *  9. mapreduce writing partition fails because Hive partition already exists
   *  10. mapreduce writing dynamic partition fails because location already exists
   *  11. mapreduce writing dynamic partition fails because partition already exists
   *  12. mapreduce writing dynamic partition fails because Hive partition already exists
   *  13. multi-output mapreduce writing partition fails because location already exists
   *  13a. first output fails, other output must rollback 0 and 5
   *  13b. second output fails, first output must rollback 0 and 5
   *  14. multi-output mapreduce writing partition fails because partition already exists
   *  14a. first output fails, other output must rollback partition 5
   *  14b. second output fails, first output must rollback partition 5
   *  15. multi-output mapreduce writing partition fails because Hive partition already exists
   *  15a. first output fails, other output must rollback partitions 0 and 5
   *  15b. second output fails, first output must rollback partitions 0 and 5
   *
   * For all these cases, we validate that existing files and partitions are preserved, and newly
   * added files and partitions are rolled back.
   */
@Test
public void testPFSRollback() throws Exception {
    ApplicationManager appManager = deployApplication(AppWritingToPartitioned.class);
    MapReduceManager mrManager = appManager.getMapReduceManager(MAPREDUCE);
    int numRuns = 0;
    Validator pfsValidator = new Validator(PFS);
    Validator otherValidator = new Validator(OTHER);
    final UnitTestManager.UnitTestDatasetManager<PartitionedFileSet> pfsManager = pfsValidator.getPfsManager();
    final PartitionedFileSet pfs = pfsManager.get();
    final PartitionedFileSet other = otherValidator.getPfsManager().get();
    final String path3 = pfsValidator.getRelativePath3();
    // 1. addPartition(location) fails because partition already exists
    try {
        pfsManager.execute(new Runnable() {

            @Override
            public void run() {
                pfs.addPartition(KEY_1, path3);
            }
        });
        Assert.fail("Expected tx to fail because partition for number=1 already exists");
    } catch (TransactionFailureException e) {
    // expected
    }
    pfsValidator.validate();
    // 2. addPartition(location) fails because Hive partition already exists
    try {
        pfsManager.execute(new Runnable() {

            @Override
            public void run() {
                pfs.addPartition(KEY_4, path3);
            }
        });
        Assert.fail("Expected tx to fail because hive partition for number=1 already exists");
    } catch (TransactionFailureException e) {
    // expected
    }
    pfsValidator.validate();
    // 3. addPartition(location) succeeds but transaction fails
    try {
        pfsManager.execute(new Runnable() {

            @Override
            public void run() {
                pfs.addPartition(KEY_3, path3);
                throw new RuntimeException("fail the tx");
            }
        });
        Assert.fail("Expected tx to fail because it threw a runtime exception");
    } catch (TransactionFailureException e) {
    // expected
    }
    pfsValidator.validate();
    // 4. partitionOutput.getPartitionOutput() fails because partition already exists
    try {
        pfs.getPartitionOutput(KEY_1);
        Assert.fail("Expected getPartitionOutput to fail, because the partition already exists.");
    } catch (DataSetException expected) {
    }
    pfsValidator.validate();
    // 5. partitionOutput.addPartition() fails because Hive partition already exists
    final PartitionOutput output4x = pfs.getPartitionOutput(KEY_4);
    final Location location4x = output4x.getLocation();
    try (Writer writer = new OutputStreamWriter(location4x.append("file").getOutputStream())) {
        writer.write("4x,4x\n");
    }
    try {
        pfsManager.execute(new Runnable() {

            @Override
            public void run() {
                output4x.addPartition();
            }
        });
        Assert.fail("Expected tx to fail because hive partition for number=4 already exists");
    } catch (TransactionFailureException e) {
    // expected
    }
    pfsValidator.validate();
    Assert.assertFalse(location4x.exists());
    // 6. partitionOutput.addPartition() succeeds but transaction fails
    final PartitionOutput output5x = pfs.getPartitionOutput(KEY_5);
    final Location location5x = output5x.getLocation();
    try (Writer writer = new OutputStreamWriter(location5x.append("file").getOutputStream())) {
        writer.write("5x,5x\n");
    }
    try {
        pfsManager.execute(new Runnable() {

            @Override
            public void run() {
                output5x.addPartition();
                throw new RuntimeException("fail the tx");
            }
        });
        Assert.fail("Expected tx to fail because it threw a runtime exception");
    } catch (TransactionFailureException e) {
    // expected
    }
    pfsValidator.validate();
    Assert.assertFalse(location5x.exists());
    // 7. mapreduce writing partition fails because location already exists
    mrManager.start(ImmutableMap.of(PFS_OUT, "1", "input.text", "1x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    // 8. mapreduce writing partition fails because partition already exists
    mrManager.start(ImmutableMap.of(PFS_OUT, "2", "input.text", "2x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_2).getLocation().exists());
    // 9. mapreduce writing partition fails because Hive partition already exists
    mrManager.start(ImmutableMap.of(PFS_OUT, "4", "input.text", "4x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
    // 10. mapreduce writing dynamic partition fails because location already exists
    mrManager.start(ImmutableMap.of("input.text", "3x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
    // 11. mapreduce writing dynamic partition fails because partition already exists
    mrManager.start(ImmutableMap.of("input.text", "2x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_2).getLocation().exists());
    Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
    // 12. mapreduce writing dynamic partition fails because Hive partition already exists
    mrManager.start(ImmutableMap.of("input.text", "0x 4x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
    Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
    Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
    // 13. multi-output mapreduce writing partition fails because location already exists
    // 13a. first output fails, other output must rollback 0 and 5
    mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "1", "input.text", "0x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    otherValidator.validate();
    Assert.assertFalse(other.getPartitionOutput(KEY_0).getLocation().exists());
    Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
    // 13b. second output fails, first output must rollback 0 and 5
    mrManager.start(ImmutableMap.of("output.datasets", BOTH, OTHER_OUT, "1", "input.text", "0x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    otherValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
    Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
    // 14. multi-output mapreduce writing partition fails because partition already exists
    // 14a. first output fails, other output must rollback partition 5
    mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "2", OTHER_OUT, "5", "input.text", "2x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    otherValidator.validate();
    Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
    // 14b. second output fails, first output must rollback partition 5
    mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "5", OTHER_OUT, "2", "input.text", "2x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    otherValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
    // 15. multi-output mapreduce writing partition fails because Hive partition already exists
    // 15a. first output fails, other output must rollback partitions 0 and 5
    mrManager.start(ImmutableMap.of("output.datasets", BOTH, PFS_OUT, "4", "input.text", "0x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    otherValidator.validate();
    Assert.assertFalse(pfs.getPartitionOutput(KEY_4).getLocation().exists());
    Assert.assertFalse(other.getPartitionOutput(KEY_0).getLocation().exists());
    Assert.assertFalse(other.getPartitionOutput(KEY_5).getLocation().exists());
    // 15b. second output fails, first output must rollback partitions 0 and 5
    mrManager.start(ImmutableMap.of("output.datasets", BOTH, OTHER_OUT, "4", "input.text", "0x 5x"));
    mrManager.waitForRuns(ProgramRunStatus.FAILED, ++numRuns, 2, TimeUnit.MINUTES);
    pfsValidator.validate();
    otherValidator.validate();
    Assert.assertFalse(other.getPartitionOutput(KEY_4).getLocation().exists());
    Assert.assertFalse(pfs.getPartitionOutput(KEY_0).getLocation().exists());
    Assert.assertFalse(pfs.getPartitionOutput(KEY_5).getLocation().exists());
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) MapReduceManager(co.cask.cdap.test.MapReduceManager) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) TransactionFailureException(org.apache.tephra.TransactionFailureException) DataSetException(co.cask.cdap.api.dataset.DataSetException) PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) UnitTestManager(co.cask.cdap.test.UnitTestManager) OutputStreamWriter(java.io.OutputStreamWriter) OutputStreamWriter(java.io.OutputStreamWriter) Writer(java.io.Writer) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 2 with PartitionOutput

use of co.cask.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.

the class SparkFileSetTestRun method addTimePartition.

private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime) throws IOException, TransactionFailureException, InterruptedException {
    TimePartitionedFileSet tpfs = tpfsManager.get();
    PartitionOutput partitionOutput = tpfs.getPartitionOutput(inputTime);
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    tpfsManager.flush();
}
Also used : PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Location(org.apache.twill.filesystem.Location)

Example 3 with PartitionOutput

use of co.cask.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.

the class SparkFileSetTestRun method testSparkWithPartitionedFileSet.

private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
    DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
    PartitionedFileSet pfs = pfsManager.get();
    PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
    Location location = partitionOutput.getLocation();
    prepareFileInput(location);
    partitionOutput.addPartition();
    pfsManager.flush();
    Map<String, String> inputArgs = new HashMap<>();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
    Map<String, String> outputArgs = new HashMap<>();
    PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    Map<String, String> args = new HashMap<>();
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
    args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
    args.put("input", "pfs");
    args.put("output", "pfs");
    SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
    sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
    pfsManager.flush();
    PartitionDetail partition = pfs.getPartition(outputKey);
    Assert.assertNotNull(partition);
    validateFileOutput(partition.getLocation());
    // Cleanup after test completed
    pfs.dropPartition(partitionOutput.getPartitionKey());
    pfs.dropPartition(partition.getPartitionKey());
    pfsManager.flush();
}
Also used : SparkManager(co.cask.cdap.test.SparkManager) PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) HashMap(java.util.HashMap) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) Location(org.apache.twill.filesystem.Location)

Example 4 with PartitionOutput

use of co.cask.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.

the class PartitionConcatenateTest method testConcatenate.

/**
 * 1. Write 100 small files (orc format) to a Partition of a PartitionedFileSet.
 * 2. Execute a partition concatenate operation.
 * 3. As compared to before the concatenate operation, validate that the number of files is reduced, while
 *    the contents of the files remains the same.
 */
@Test
public void testConcatenate() throws Exception {
    String orcPFS = "orcPFS";
    addDatasetInstance(PartitionedFileSet.class.getName(), orcPFS, PartitionedFileSetProperties.builder().setPartitioning(Partitioning.builder().addLongField("time").build()).setOutputFormat(OrcNewOutputFormat.class).setEnableExploreOnCreate(true).setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde").setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat").setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat").setExploreSchema("record STRING").build());
    // 1. create 100 small files in the input FileSet
    DataSetManager<PartitionedFileSet> cleanRecordsManager = getDataset(orcPFS);
    PartitionedFileSet cleanRecords = cleanRecordsManager.get();
    PartitionKey outputPartition = PartitionKey.builder().addLongField("time", 5000).build();
    PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
    Location partitionLocation = partitionOutput.getLocation();
    int numInputFiles = 100;
    List<String> writtenData = writeSmallOrcFiles(partitionLocation, numInputFiles);
    partitionOutput.addPartition();
    Assert.assertEquals(writtenData, getExploreResults(orcPFS));
    // this is a timestamp before concatenating, but after writing the files
    long beforeConcatTime = System.currentTimeMillis();
    List<Location> dataFiles = listFilteredChildren(partitionLocation);
    // each input file will result in one output file, due to the FileInputFormat class and FileOutputFormat class
    // being used
    Assert.assertEquals(numInputFiles, dataFiles.size());
    for (Location dataFile : dataFiles) {
        // all the files should have a lastModified smaller than now
        Assert.assertTrue(dataFile.lastModified() < beforeConcatTime);
    }
    // 2. run the concatenate operation
    cleanRecords.concatenatePartition(outputPartition).get();
    // 3. check that the data files' lastModified timestamp is updated, and there should be fewer of them
    dataFiles = listFilteredChildren(partitionLocation);
    Assert.assertTrue(dataFiles.size() < numInputFiles);
    // should have a lastModified larger than now
    Assert.assertTrue(Iterables.getOnlyElement(dataFiles).lastModified() > beforeConcatTime);
    // even though the files were concatenated, the explore results should be unchanged
    Assert.assertEquals(writtenData, getExploreResults(orcPFS));
}
Also used : PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) OrcNewOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcNewOutputFormat) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)

Example 5 with PartitionOutput

use of co.cask.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.

the class PartitionedFileSetTest method testPartitionCreationTime.

@Test
public void testPartitionCreationTime() throws Exception {
    final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
    dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {

        @Override
        public void apply() throws Exception {
            PartitionOutput partitionOutput = dataset.getPartitionOutput(PARTITION_KEY);
            long beforeTime = System.currentTimeMillis();
            partitionOutput.addPartition();
            long afterTime = System.currentTimeMillis();
            PartitionDetail partitionDetail = dataset.getPartition(PARTITION_KEY);
            Assert.assertNotNull(partitionDetail);
            long creationTime = partitionDetail.getMetadata().getCreationTime();
            long lastModificationTime = partitionDetail.getMetadata().lastModificationTime();
            // lastModificationTime time should be equal to creationTime for a partition that has not been appended to
            Assert.assertEquals(creationTime, lastModificationTime);
            Assert.assertTrue(creationTime >= beforeTime && creationTime <= afterTime);
        }
    });
}
Also used : PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) TransactionAware(org.apache.tephra.TransactionAware) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) TransactionExecutor(org.apache.tephra.TransactionExecutor) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) PartitionNotFoundException(co.cask.cdap.api.dataset.PartitionNotFoundException) PartitionAlreadyExistsException(co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException) IOException(java.io.IOException) DataSetException(co.cask.cdap.api.dataset.DataSetException) Test(org.junit.Test)

Aggregations

PartitionOutput (co.cask.cdap.api.dataset.lib.PartitionOutput)10 PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet)7 Test (org.junit.Test)6 DataSetException (co.cask.cdap.api.dataset.DataSetException)5 PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail)5 PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey)5 Location (org.apache.twill.filesystem.Location)5 PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException)4 PartitionAlreadyExistsException (co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException)4 IOException (java.io.IOException)4 TransactionAware (org.apache.tephra.TransactionAware)4 TransactionExecutor (org.apache.tephra.TransactionExecutor)4 TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 HashMap (java.util.HashMap)2 Predicate (co.cask.cdap.api.Predicate)1 PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter)1 ApplicationManager (co.cask.cdap.test.ApplicationManager)1 MapReduceManager (co.cask.cdap.test.MapReduceManager)1 SparkManager (co.cask.cdap.test.SparkManager)1