Example 6 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class FileUploadServiceTestRun, method testFileUploadService.

@Test
public void testFileUploadService() throws Exception {
    ApplicationManager appManager = deployApplication(FileUploadApp.class);
    // Start the service
    ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
    serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
    try {
        // Upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
        URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
        // Upload with wrong MD5, should get 400.
        byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
        Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST,
                            upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(),
                                   content, "123", 30));
        long beforeUploadTime = System.currentTimeMillis();
        // Upload with right MD5, should get 200
        Assert.assertEquals(HttpURLConnection.HTTP_OK,
                            upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(),
                                   content,
                                   BaseEncoding.base64().encode(Hashing.md5().hashBytes(content).asBytes()), 20));
        // Inspect the partitioned file set and verify the content
        PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
        PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
        Assert.assertNotNull(partition);
        // Verify a notification should have been published for the new partition
        List<Notification> notifications = getDataNotifications(beforeUploadTime);
        // Should have one message
        Assert.assertEquals(1, notifications.size());
        verifyDataNotification(notifications.get(0),
                               NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME),
                               Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
        // There should be one file under the partition directory
        List<Location> locations = partition.getLocation().list();
        Assert.assertEquals(1, locations.size());
        Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
        // Verify the tracking table of chunk sizes
        KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
        CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
        // Sum up all chunk sizes as tracked by the tracking table.
        long sum = 0;
        int iterSize = 0;
        while (iter.hasNext()) {
            KeyValue<byte[], byte[]> kv = iter.next();
            sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
            iterSize++;
        }
        // The iterator should have size >= 2, since we use a different chunk size for each of the two uploads
        Assert.assertTrue(iterSize >= 2);
        // The sum of all chunk sizes should equal the
        // content size * 2 (since there was one failed and one successful upload)
        Assert.assertEquals(content.length * 2, sum);
    } finally {
        serviceManager.stop();
        serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
    }
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) KeyValue(co.cask.cdap.api.dataset.lib.KeyValue) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) URI(java.net.URI) Notification(co.cask.cdap.proto.Notification) ServiceManager(co.cask.cdap.test.ServiceManager) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
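
The upload(...) helper is not shown on this page. Below is a minimal sketch of what it might look like, assuming the last parameter is a chunk size for HTTP chunked streaming (which would explain the per-chunk-size tracking table asserted above) and that the service validates a Content-MD5 header; the real helper in FileUploadServiceTestRun may differ.

private int upload(URL url, byte[] content, String md5, int chunkSize) throws IOException {
    HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
    try {
        urlConn.setRequestMethod("POST");
        urlConn.setDoOutput(true);
        // Assumption: the chunk size controls HTTP chunked transfer encoding
        urlConn.setChunkedStreamingMode(chunkSize);
        // Assumption: the service compares this header against the MD5 of the received bytes
        urlConn.setRequestProperty("Content-MD5", md5);
        try (OutputStream os = urlConn.getOutputStream()) {
            os.write(content);
        }
        return urlConn.getResponseCode();
    } finally {
        urlConn.disconnect();
    }
}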

Example 7 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class PartitionConcatenateTest, method testConcatenate.

/**
 * 1. Write 100 small files (orc format) to a Partition of a PartitionedFileSet.
 * 2. Execute a partition concatenate operation.
 * 3. As compared to before the concatenate operation, validate that the number of files is reduced, while
 *    the contents of the files remains the same.
 */
@Test
public void testConcatenate() throws Exception {
    String orcPFS = "orcPFS";
    addDatasetInstance(PartitionedFileSet.class.getName(), orcPFS, PartitionedFileSetProperties.builder()
        .setPartitioning(Partitioning.builder().addLongField("time").build())
        .setOutputFormat(OrcNewOutputFormat.class)
        .setEnableExploreOnCreate(true)
        .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
        .setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
        .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
        .setExploreSchema("record STRING")
        .build());
    // 1. create 100 small files in the input FileSet
    DataSetManager<PartitionedFileSet> cleanRecordsManager = getDataset(orcPFS);
    PartitionedFileSet cleanRecords = cleanRecordsManager.get();
    PartitionKey outputPartition = PartitionKey.builder().addLongField("time", 5000).build();
    PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
    Location partitionLocation = partitionOutput.getLocation();
    int numInputFiles = 100;
    List<String> writtenData = writeSmallOrcFiles(partitionLocation, numInputFiles);
    partitionOutput.addPartition();
    Assert.assertEquals(writtenData, getExploreResults(orcPFS));
    // this is a timestamp before concatenating, but after writing the files
    long beforeConcatTime = System.currentTimeMillis();
    List<Location> dataFiles = listFilteredChildren(partitionLocation);
    // each input file results in one output file, because the job uses FileInputFormat and FileOutputFormat
    Assert.assertEquals(numInputFiles, dataFiles.size());
    for (Location dataFile : dataFiles) {
        // all files should have a lastModified earlier than beforeConcatTime
        Assert.assertTrue(dataFile.lastModified() < beforeConcatTime);
    }
    // 2. run the concatenate operation
    cleanRecords.concatenatePartition(outputPartition).get();
    // 3. check that the data files' lastModified timestamp is updated, and there should be fewer of them
    dataFiles = listFilteredChildren(partitionLocation);
    Assert.assertTrue(dataFiles.size() < numInputFiles);
    // the single remaining file should have a lastModified later than beforeConcatTime
    Assert.assertTrue(Iterables.getOnlyElement(dataFiles).lastModified() > beforeConcatTime);
    // even though the files were concatenated, the explore results should be unchanged
    Assert.assertEquals(writtenData, getExploreResults(orcPFS));
}
Also used : PartitionOutput(co.cask.cdap.api.dataset.lib.PartitionOutput) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) OrcNewOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcNewOutputFormat) Location(org.apache.twill.filesystem.Location) Test(org.junit.Test)
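
The listFilteredChildren(...) helper is also not shown here. One plausible implementation, assuming it merely skips hidden and marker files (names starting with "_" or "."), which Hadoop output committers typically leave behind:

private List<Location> listFilteredChildren(Location location) throws IOException {
    List<Location> children = new ArrayList<>();
    for (Location child : location.list()) {
        // skip metadata such as _SUCCESS markers and hidden files
        if (!child.getName().startsWith("_") && !child.getName().startsWith(".")) {
            children.add(child);
        }
    }
    return children;
}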

Example 8 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class ClicksAndViewsMapReduce, method initialize.

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    context.addInput(Input.ofStream(ClicksAndViews.CLICKS));
    context.addInput(Input.ofStream(ClicksAndViews.VIEWS));
    PartitionedFileSet joinedPFS = context.getDataset(ClicksAndViews.JOINED);
    PartitionKey outputPartitionKey = PartitionedFileSetArguments.getOutputPartitionKey(context.getRuntimeArguments(), joinedPFS.getPartitioning());
    if (outputPartitionKey == null) {
        outputPartitionKey = PartitionKey.builder().addLongField("runtime", context.getLogicalStartTime()).build();
    }
    Map<String, String> outputArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputPartitionKey);
    context.addOutput(Output.ofDataset(ClicksAndViews.JOINED, outputArgs));
    Job job = context.getHadoopJob();
    job.setMapperClass(ImpressionKeyingMapper.class);
    job.setReducerClass(JoiningReducer.class);
}
Also used : MapReduceContext(co.cask.cdap.api.mapreduce.MapReduceContext) HashMap(java.util.HashMap) PartitionKey(co.cask.cdap.api.dataset.lib.PartitionKey) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) Job(org.apache.hadoop.mapreduce.Job)
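
For reference, a caller could supply an explicit output partition key through runtime arguments, so that the initialize() above does not fall back to the logical start time. A sketch, assuming a test context; the manager variable and the program name string are illustrative, not taken from the project.

Map<String, String> runtimeArgs = new HashMap<>();
// Writes the key into the runtime arguments that
// PartitionedFileSetArguments.getOutputPartitionKey(...) reads back in initialize()
PartitionedFileSetArguments.setOutputPartitionKey(runtimeArgs,
    PartitionKey.builder().addLongField("runtime", 1000L).build());
applicationManager.getMapReduceManager("ClicksAndViewsMapReduce").start(runtimeArgs);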

Example 9 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class DataCleansingMapReduceTest, method testPartitionConsuming.

@Test
public void testPartitionConsuming() throws Exception {
    ApplicationManager applicationManager = deployApplication(DataCleansing.class);
    ServiceManager serviceManager = applicationManager.getServiceManager(DataCleansingService.NAME).start();
    serviceManager.waitForStatus(true);
    URL serviceURL = serviceManager.getServiceURL();
    // write a set of records to one partition and run the DataCleansingMapReduce job on that one partition
    createPartition(serviceURL, RECORD_SET1);
    // before starting the MR, there are 0 invalid records and 0 valid records, according to metrics
    Assert.assertEquals(0, getValidityMetrics(true));
    Assert.assertEquals(0, getValidityMetrics(false));
    Long now = System.currentTimeMillis();
    ImmutableMap<String, String> args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(), DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
    MapReduceManager mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
    mapReduceManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(RECORD_SET1, true));
    compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(RECORD_SET1, false));
    // assert that some of the records have indeed been filtered
    Assert.assertNotEquals(filterRecords(RECORD_SET1, true), RECORD_SET1);
    Assert.assertNotEquals(filterRecords(RECORD_SET1, false), Collections.<String>emptySet());
    // verify this via metrics
    Assert.assertEquals(1, getValidityMetrics(true));
    Assert.assertEquals(1, getValidityMetrics(false));
    // create two additional partitions
    createPartition(serviceURL, RECORD_SET2);
    createPartition(serviceURL, RECORD_SET3);
    // running the MapReduce job now processes the two new partitions (RECORD_SET2 and RECORD_SET3) and creates a new
    // partition with the output
    now = System.currentTimeMillis();
    args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(), DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
    mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
    mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    ImmutableSet<String> recordSets2and3 = ImmutableSet.<String>builder().addAll(RECORD_SET2).addAll(RECORD_SET3).build();
    compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(recordSets2and3, true));
    compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(recordSets2and3, false));
    // verify this via metrics
    Assert.assertEquals(1, getValidityMetrics(true));
    Assert.assertEquals(5, getValidityMetrics(false));
    // running the MapReduce job without adding new partitions creates no additional output
    now = System.currentTimeMillis();
    args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(), DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
    mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
    mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
    compareData(now, DataCleansing.CLEAN_RECORDS, Collections.<String>emptySet());
    compareData(now, DataCleansing.INVALID_RECORDS, Collections.<String>emptySet());
    // verify that the records were properly partitioned on their zip
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(DataCleansing.CLEAN_RECORDS);
    PartitionFilter filter = PartitionFilter.builder().addValueCondition("zip", 84125).build();
    Assert.assertEquals(ImmutableSet.of(RECORD1, RECORD4, RECORD6), getDataFromFilter(cleanRecords.get(), filter));
    filter = PartitionFilter.builder().addValueCondition("zip", 84126).build();
    Assert.assertEquals(ImmutableSet.of(RECORD3, RECORD5), getDataFromFilter(cleanRecords.get(), filter));
}
Also used : ApplicationManager(co.cask.cdap.test.ApplicationManager) MapReduceManager(co.cask.cdap.test.MapReduceManager) PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) ServiceManager(co.cask.cdap.test.ServiceManager) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) URL(java.net.URL) Test(org.junit.Test)
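
The getDataFromFilter(...) helper is not shown on this page. One way it could be written, assuming it reads every line of the MapReduce output files ("part-" files, as in Example 10) across all partitions matching the filter; the actual helper in the test may differ.

private Set<String> getDataFromFilter(PartitionedFileSet pfs, PartitionFilter filter) throws IOException {
    Set<String> data = new HashSet<>();
    // getPartitions(filter) returns the metadata of every partition matching the filter
    for (PartitionDetail partition : pfs.getPartitions(filter)) {
        for (Location file : partition.getLocation().list()) {
            if (!file.getName().startsWith("part-")) {
                continue;
            }
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(file.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    data.add(line);
                }
            }
        }
    }
    return data;
}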

Example 10 with PartitionedFileSet

Use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.

From the class ClicksAndViewsMapReduceTest, method getDataFromFile.

private Set<String> getDataFromFile() throws Exception {
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(ClicksAndViews.JOINED);
    Set<String> cleanData = new HashSet<>();
    // we configured the MapReduce to write to this partition when starting it
    PartitionDetail partition = cleanRecords.get().getPartition(
        PartitionKey.builder().addLongField("runtime", OUTPUT_PARTITION_RUNTIME).build());
    Assert.assertNotNull(partition);
    for (Location location : partition.getLocation().list()) {
        if (location.getName().startsWith("part-")) {
            try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(location.getInputStream()))) {
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    cleanData.add(line);
                }
            }
        }
    }
    return cleanData;
}
Also used : InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) PartitionDetail(co.cask.cdap.api.dataset.lib.PartitionDetail) HashSet(java.util.HashSet) Location(org.apache.twill.filesystem.Location)

Aggregations

PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) 65
Test (org.junit.Test) 39
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey) 32
Location (org.apache.twill.filesystem.Location) 25
TransactionAware (org.apache.tephra.TransactionAware) 24
TransactionExecutor (org.apache.tephra.TransactionExecutor) 24
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail) 18
IOException (java.io.IOException) 17
DataSetException (co.cask.cdap.api.dataset.DataSetException) 12
FileSet (co.cask.cdap.api.dataset.lib.FileSet) 12
HashSet (java.util.HashSet) 12
List (java.util.List) 12
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException) 11
PartitionAlreadyExistsException (co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException) 11
ConcurrentPartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.ConcurrentPartitionConsumer) 11
PartitionConsumer (co.cask.cdap.api.dataset.lib.partitioned.PartitionConsumer) 11
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) 9
ImmutableList (com.google.common.collect.ImmutableList) 9
ArrayList (java.util.ArrayList) 9
HashMap (java.util.HashMap) 9