use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class FileUploadServiceTestRun method testFileUploadService.
@Test
public void testFileUploadService() throws Exception {
ApplicationManager appManager = deployApplication(FileUploadApp.class);
// Start the service
ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
try {
// Upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
// Upload with wrong MD5, should get 400.
byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST,
  upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, "123", 30));
long beforeUploadTime = System.currentTimeMillis();
// Upload with right MD5, should get 200
Assert.assertEquals(HttpURLConnection.HTTP_OK,
  upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content,
    BaseEncoding.base64().encode(Hashing.md5().hashBytes(content).asBytes()), 20));
// Inspect the partitioned file set and verify the content
PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
Assert.assertNotNull(partition);
// Verify that a notification was published for the new partition
List<Notification> notifications = getDataNotifications(beforeUploadTime);
// Should have one message
Assert.assertEquals(1, notifications.size());
verifyDataNotification(notifications.get(0), NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME),
  Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
// There should be one file under the partition directory
List<Location> locations = partition.getLocation().list();
Assert.assertEquals(1, locations.size());
Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
// Verify the tracking table of chunk sizes
KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
// Sum up all chunk sizes tracked by the tracking table.
long sum = 0;
int iterSize = 0;
while (iter.hasNext()) {
KeyValue<byte[], byte[]> kv = iter.next();
sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
iterSize++;
}
// The iterator should have size >= 2, since we use different chunk sizes for the two different uploads
Assert.assertTrue(iterSize >= 2);
// The sum of all chunk sizes should equal the
// content size * 2 (since there was one failed and one successful upload)
Assert.assertEquals(content.length * 2, sum);
} finally {
serviceManager.stop();
serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
}
}
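The upload(...) helper invoked above is not shown in this snippet. Below is a minimal sketch of what such a helper might look like, assuming it streams the content with chunked streaming at the given chunk size and passes the expected digest in a Content-MD5 header; the method name and parameters match the calls above, but the request method and header name are assumptions, not taken from the source.
private int upload(URL url, byte[] content, String md5, int chunkSize) throws IOException {
  HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
  try {
    urlConn.setDoOutput(true);
    urlConn.setRequestMethod("POST");
    // Stream the body in fixed-size chunks, so different uploads can use different chunk sizes
    urlConn.setChunkedStreamingMode(chunkSize);
    // Assumption: the service validates this header against the digest of the received content
    urlConn.setRequestProperty("Content-MD5", md5);
    try (OutputStream os = urlConn.getOutputStream()) {
      os.write(content);
    }
    return urlConn.getResponseCode();
  } finally {
    urlConn.disconnect();
  }
}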
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class PartitionConcatenateTest method testConcatenate.
/**
* 1. Write 100 small files (ORC format) to a partition of a PartitionedFileSet.
* 2. Execute a partition concatenate operation.
* 3. Compared to before the concatenate operation, validate that the number of files is reduced, while
* the contents of the files remain the same.
*/
@Test
public void testConcatenate() throws Exception {
String orcPFS = "orcPFS";
addDatasetInstance(PartitionedFileSet.class.getName(), orcPFS, PartitionedFileSetProperties.builder()
  .setPartitioning(Partitioning.builder().addLongField("time").build())
  .setOutputFormat(OrcNewOutputFormat.class)
  .setEnableExploreOnCreate(true)
  .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
  .setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
  .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
  .setExploreSchema("record STRING")
  .build());
// 1. create 100 small files in the input FileSet
DataSetManager<PartitionedFileSet> cleanRecordsManager = getDataset(orcPFS);
PartitionedFileSet cleanRecords = cleanRecordsManager.get();
PartitionKey outputPartition = PartitionKey.builder().addLongField("time", 5000).build();
PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
Location partitionLocation = partitionOutput.getLocation();
int numInputFiles = 100;
List<String> writtenData = writeSmallOrcFiles(partitionLocation, numInputFiles);
partitionOutput.addPartition();
Assert.assertEquals(writtenData, getExploreResults(orcPFS));
// this is a timestamp before concatenating, but after writing the files
long beforeConcatTime = System.currentTimeMillis();
List<Location> dataFiles = listFilteredChildren(partitionLocation);
// each input file results in one output file, due to the FileInputFormat and FileOutputFormat
// classes being used
Assert.assertEquals(numInputFiles, dataFiles.size());
for (Location dataFile : dataFiles) {
// all the files should have a lastModified earlier than beforeConcatTime
Assert.assertTrue(dataFile.lastModified() < beforeConcatTime);
}
// 2. run the concatenate operation
cleanRecords.concatenatePartition(outputPartition).get();
// 3. check that the data files' lastModified timestamp is updated, and there should be fewer of them
dataFiles = listFilteredChildren(partitionLocation);
Assert.assertTrue(dataFiles.size() < numInputFiles);
// the single remaining file should have a lastModified later than beforeConcatTime
Assert.assertTrue(Iterables.getOnlyElement(dataFiles).lastModified() > beforeConcatTime);
// even though the files were concatenated, the explore results should be unchanged
Assert.assertEquals(writtenData, getExploreResults(orcPFS));
}
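The listFilteredChildren(...) helper used above is not included in this snippet. A minimal sketch, under the assumption that it simply lists the partition directory while skipping hidden and bookkeeping files (for example _SUCCESS markers) that Hadoop output formats can leave next to the data files:
private List<Location> listFilteredChildren(Location location) throws IOException {
  List<Location> children = new ArrayList<>();
  for (Location child : location.list()) {
    // Skip hidden and bookkeeping files such as _SUCCESS so only data files are counted
    if (!child.getName().startsWith("_") && !child.getName().startsWith(".")) {
      children.add(child);
    }
  }
  return children;
}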
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class ClicksAndViewsMapReduce method initialize.
@Override
public void initialize() throws Exception {
MapReduceContext context = getContext();
context.addInput(Input.ofStream(ClicksAndViews.CLICKS));
context.addInput(Input.ofStream(ClicksAndViews.VIEWS));
PartitionedFileSet joinedPFS = context.getDataset(ClicksAndViews.JOINED);
PartitionKey outputPartitionKey = PartitionedFileSetArguments.getOutputPartitionKey(context.getRuntimeArguments(), joinedPFS.getPartitioning());
if (outputPartitionKey == null) {
outputPartitionKey = PartitionKey.builder().addLongField("runtime", context.getLogicalStartTime()).build();
}
Map<String, String> outputArgs = new HashMap<>();
PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputPartitionKey);
context.addOutput(Output.ofDataset(ClicksAndViews.JOINED, outputArgs));
Job job = context.getHadoopJob();
job.setMapperClass(ImpressionKeyingMapper.class);
job.setReducerClass(JoiningReducer.class);
}
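Because initialize() reads the output partition key from the runtime arguments before falling back to the logical start time, a caller can pin the output partition explicitly. A minimal sketch of how a test might do this when starting the program; the manager variables, the program name string, and the literal timestamp are assumptions, not taken from the source.
Map<String, String> runtimeArgs = new HashMap<>();
// Sets "output.partition.key.runtime" in the runtime arguments, which the MapReduce's
// initialize() above picks up instead of the logical start time
PartitionedFileSetArguments.setOutputPartitionKey(
  runtimeArgs, PartitionKey.builder().addLongField("runtime", 1468200000000L).build());
applicationManager.getMapReduceManager("ClicksAndViewsMapReduce").start(runtimeArgs);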
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class DataCleansingMapReduceTest method testPartitionConsuming.
@Test
public void testPartitionConsuming() throws Exception {
ApplicationManager applicationManager = deployApplication(DataCleansing.class);
ServiceManager serviceManager = applicationManager.getServiceManager(DataCleansingService.NAME).start();
serviceManager.waitForStatus(true);
URL serviceURL = serviceManager.getServiceURL();
// write a set of records to one partition and run the DataCleansingMapReduce job on that one partition
createPartition(serviceURL, RECORD_SET1);
// before starting the MR, there are 0 invalid records and 0 valid records, according to metrics
Assert.assertEquals(0, getValidityMetrics(true));
Assert.assertEquals(0, getValidityMetrics(false));
Long now = System.currentTimeMillis();
ImmutableMap<String, String> args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(), DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
MapReduceManager mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
mapReduceManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(RECORD_SET1, true));
compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(RECORD_SET1, false));
// assert that some of the records have indeed been filtered
Assert.assertNotEquals(filterRecords(RECORD_SET1, true), RECORD_SET1);
Assert.assertNotEquals(filterRecords(RECORD_SET1, false), Collections.<String>emptySet());
// verify this via metrics
Assert.assertEquals(1, getValidityMetrics(true));
Assert.assertEquals(1, getValidityMetrics(false));
// create two additional partitions
createPartition(serviceURL, RECORD_SET2);
createPartition(serviceURL, RECORD_SET3);
// running the MapReduce job now processes these two new partitions (RECORD_SET2 and RECORD_SET3) and creates a new
// partition with the output
now = System.currentTimeMillis();
args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(), DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
ImmutableSet<String> recordSets2and3 = ImmutableSet.<String>builder().addAll(RECORD_SET2).addAll(RECORD_SET3).build();
compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(recordSets2and3, true));
compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(recordSets2and3, false));
// verify this via metrics
Assert.assertEquals(1, getValidityMetrics(true));
Assert.assertEquals(5, getValidityMetrics(false));
// running the MapReduce job without adding new partitions creates no additional output
now = System.currentTimeMillis();
args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(), DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
compareData(now, DataCleansing.CLEAN_RECORDS, Collections.<String>emptySet());
compareData(now, DataCleansing.INVALID_RECORDS, Collections.<String>emptySet());
// verify that the records were properly partitioned on their zip
DataSetManager<PartitionedFileSet> cleanRecords = getDataset(DataCleansing.CLEAN_RECORDS);
PartitionFilter filter = PartitionFilter.builder().addValueCondition("zip", 84125).build();
Assert.assertEquals(ImmutableSet.of(RECORD1, RECORD4, RECORD6), getDataFromFilter(cleanRecords.get(), filter));
filter = PartitionFilter.builder().addValueCondition("zip", 84126).build();
Assert.assertEquals(ImmutableSet.of(RECORD3, RECORD5), getDataFromFilter(cleanRecords.get(), filter));
}
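The getDataFromFilter(...) helper referenced above is not shown here. A minimal sketch, assuming it collects every line from every file of the partitions matching the filter; the exact file filtering and charset handling are assumptions.
private Set<String> getDataFromFilter(PartitionedFileSet pfs, PartitionFilter filter) throws IOException {
  Set<String> records = new HashSet<>();
  // getPartitions(filter) returns only the partitions whose keys satisfy the filter
  for (PartitionDetail partition : pfs.getPartitions(filter)) {
    for (Location file : partition.getLocation().list()) {
      try (BufferedReader reader =
             new BufferedReader(new InputStreamReader(file.getInputStream(), Charsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
          records.add(line);
        }
      }
    }
  }
  return records;
}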
use of co.cask.cdap.api.dataset.lib.PartitionedFileSet in project cdap by caskdata.
the class ClicksAndViewsMapReduceTest method getDataFromFile.
private Set<String> getDataFromFile() throws Exception {
DataSetManager<PartitionedFileSet> cleanRecords = getDataset(ClicksAndViews.JOINED);
Set<String> cleanData = new HashSet<>();
// we configured the MapReduce to write to this partition when starting it
PartitionDetail partition = cleanRecords.get().getPartition(PartitionKey.builder().addLongField("runtime", OUTPUT_PARTITION_RUNTIME).build());
Assert.assertNotNull(partition);
for (Location location : partition.getLocation().list()) {
if (location.getName().startsWith("part-")) {
try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(location.getInputStream()))) {
String line;
while ((line = bufferedReader.readLine()) != null) {
cleanData.add(line);
}
}
}
}
return cleanData;
}