Use of org.apache.twill.filesystem.Location in project cdap by caskdata, in the class SparkFileSetTestRun, method testSparkWithPartitionedFileSet.
private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
  PartitionedFileSet pfs = pfsManager.get();
  PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  pfsManager.flush();
  Map<String, String> inputArgs = new HashMap<>();
  PartitionedFileSetArguments.setInputPartitionFilter(inputArgs,
                                                      PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
  Map<String, String> outputArgs = new HashMap<>();
  PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
  args.put("input", "pfs");
  args.put("output", "pfs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  pfsManager.flush();
  PartitionDetail partition = pfs.getPartition(outputKey);
  Assert.assertNotNull(partition);
  validateFileOutput(partition.getLocation());
  // Cleanup after the test has completed
  pfs.dropPartition(partitionOutput.getPartitionKey());
  pfs.dropPartition(partition.getPartitionKey());
  pfsManager.flush();
}
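The helpers prepareFileInput and validateFileOutput used above are not part of this excerpt. As a rough, hypothetical sketch of how such helpers typically use the twill Location API (the file name, contents, and assertions below are assumptions, not the real test's values):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import org.apache.twill.filesystem.Location;
import org.junit.Assert;

// Hypothetical sketch: write a single input file under the given location.
// Location.append() resolves a child path; getOutputStream() creates the file.
private void prepareFileInput(Location location) throws IOException {
  try (PrintWriter writer = new PrintWriter(location.append("inputFile").getOutputStream())) {
    writer.println("some input line");
    writer.println("another input line");
  }
}

// Hypothetical sketch: read every file under the output location and check its contents.
// The real helper asserts on the specific output of the Spark program; here we only
// check that each line is non-empty.
private void validateFileOutput(Location location) throws IOException {
  for (Location child : location.list()) {
    try (BufferedReader reader =
           new BufferedReader(new InputStreamReader(child.getInputStream(), StandardCharsets.UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        Assert.assertFalse(line.isEmpty());
      }
    }
  }
}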
Use of org.apache.twill.filesystem.Location in project cdap by caskdata, in the class SparkFileSetTestRun, method testSparkWithFileSet.
private void testSparkWithFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<FileSet> filesetManager = getDataset("fs");
  FileSet fileset = filesetManager.get();
  Location location = fileset.getLocation("nn");
  prepareFileInput(location);
  Map<String, String> inputArgs = new HashMap<>();
  FileSetArguments.setInputPath(inputArgs, "nn");
  Map<String, String> outputArgs = new HashMap<>();
  FileSetArguments.setOutputPath(outputArgs, "xx");
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "fs", outputArgs));
  args.put("input", "fs");
  args.put("output", "fs");
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 1, TimeUnit.MINUTES);
  validateFileOutput(fileset.getLocation("xx"), "custom:");
  // Cleanup paths after running the test
  fileset.getLocation("nn").delete(true);
  fileset.getLocation("xx").delete(true);
}
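Both FileSet tests pass their dataset arguments through RuntimeArguments.addScope(Scope.DATASET, ...), which namespaces each key so that it applies only to the named dataset. Assuming the usual CDAP convention of a <scope>.<name>.<key> prefix and "input.paths" as the key set by FileSetArguments.setInputPath (both assumptions worth confirming against the Javadoc of your CDAP version), a minimal, self-contained illustration of the effect is:

import java.util.HashMap;
import java.util.Map;

public final class DatasetArgumentScoping {

  // Sketch of the prefixing that dataset scoping is assumed to perform; this is an
  // illustration, not the CDAP RuntimeArguments implementation.
  static Map<String, String> scopeToDataset(String datasetName, Map<String, String> args) {
    Map<String, String> scoped = new HashMap<>();
    for (Map.Entry<String, String> entry : args.entrySet()) {
      scoped.put("dataset." + datasetName + "." + entry.getKey(), entry.getValue());
    }
    return scoped;
  }

  public static void main(String[] args) {
    Map<String, String> inputArgs = new HashMap<>();
    // "input.paths" is the key FileSetArguments.setInputPath is assumed to set.
    inputArgs.put("input.paths", "nn");
    // Prints {dataset.fs.input.paths=nn}, i.e. the argument now targets only the "fs" dataset.
    System.out.println(scopeToDataset("fs", inputArgs));
  }
}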
Use of org.apache.twill.filesystem.Location in project cdap by caskdata, in the class DynamicPartitioningTestRun, method validateFiles.
private void validateFiles(String dataset, Location expectedExisting) throws Exception {
  DataSetManager<PartitionedFileSet> pfs = getDataset(testSpace.dataset(dataset));
  Location base = pfs.get().getEmbeddedFileSet().getBaseLocation();
  validateFiles(base, expectedExisting);
}
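The two-argument validateFiles overload called on the last line is not shown in this excerpt. A hedged sketch of what such a recursive walk over Location objects could look like (the name and the toURI()-based equality check are assumptions):

import java.io.IOException;
import org.apache.twill.filesystem.Location;
import org.junit.Assert;

// Hypothetical sketch: walk the fileset's directory tree and assert that the only
// regular file left under it is the one expected to survive.
private void validateFiles(Location dir, Location expectedExisting) throws IOException {
  for (Location child : dir.list()) {
    if (child.isDirectory()) {
      validateFiles(child, expectedExisting);
    } else {
      Assert.assertEquals(expectedExisting.toURI(), child.toURI());
    }
  }
}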
Use of org.apache.twill.filesystem.Location in project cdap by caskdata, in the class FileUploadServiceTestRun, method testFileUploadService.
@Test
public void testFileUploadService() throws Exception {
  ApplicationManager appManager = deployApplication(FileUploadApp.class);
  // Start the service
  ServiceManager serviceManager = appManager.getServiceManager(FileUploadApp.SERVICE_NAME).start();
  serviceManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
  try {
    // The upload URL is "base/upload/pfs/[partition_value]", where the partition value is a long
    URI serviceURI = serviceManager.getServiceURL(10, TimeUnit.SECONDS).toURI();
    // Upload with the wrong MD5; should get 400
    byte[] content = Strings.repeat("0123456789 ", 100).getBytes(Charsets.UTF_8);
    Assert.assertEquals(HttpURLConnection.HTTP_BAD_REQUEST,
                        upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content, "123", 30));
    long beforeUploadTime = System.currentTimeMillis();
    // Upload with the right MD5; should get 200
    Assert.assertEquals(HttpURLConnection.HTTP_OK,
                        upload(serviceURI.resolve("upload/" + FileUploadApp.PFS_NAME + "/1").toURL(), content,
                               BaseEncoding.base64().encode(Hashing.md5().hashBytes(content).asBytes()), 20));
    // Inspect the partitioned file set and verify the content
    PartitionedFileSet pfs = (PartitionedFileSet) getDataset(FileUploadApp.PFS_NAME).get();
    PartitionDetail partition = pfs.getPartition(PartitionKey.builder().addLongField("time", 1).build());
    Assert.assertNotNull(partition);
    // Verify that a notification was published for the new partition
    List<Notification> notifications = getDataNotifications(beforeUploadTime);
    // There should be one message
    Assert.assertEquals(1, notifications.size());
    verifyDataNotification(notifications.get(0), NamespaceId.DEFAULT.dataset(FileUploadApp.PFS_NAME),
                           Collections.singletonList(PartitionKey.builder().addLongField("time", 1L).build()));
    // There should be one file under the partition directory
    List<Location> locations = partition.getLocation().list();
    Assert.assertEquals(1, locations.size());
    Assert.assertArrayEquals(content, ByteStreams.toByteArray(Locations.newInputSupplier(locations.get(0))));
    // Verify the tracking table of chunk sizes
    KeyValueTable trackingTable = (KeyValueTable) getDataset(FileUploadApp.KV_TABLE_NAME).get();
    CloseableIterator<KeyValue<byte[], byte[]>> iter = trackingTable.scan(null, null);
    // Sum up all chunk sizes tracked by the tracking table
    long sum = 0;
    int iterSize = 0;
    while (iter.hasNext()) {
      KeyValue<byte[], byte[]> kv = iter.next();
      sum += Bytes.toInt(kv.getKey()) * Bytes.toLong(kv.getValue());
      iterSize++;
    }
    // The iterator should have size >= 2, since a different chunk size is used for each of the two uploads
    Assert.assertTrue(iterSize >= 2);
    // The sum of all chunk sizes should equal the content size * 2
    // (one failed upload and one successful upload)
    Assert.assertEquals(content.length * 2, sum);
  } finally {
    serviceManager.stop();
    serviceManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
  }
}
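The upload helper invoked above is not included in this excerpt. Judging from its arguments (a URL, the content, a base64 MD5 string, and what the comments describe as a chunk size), a plausible sketch using plain HttpURLConnection might look like the following; the HTTP method and the Content-MD5 header name are assumptions, not the service's documented contract:

import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

// Hypothetical sketch: POST the content with chunked streaming of the given chunk size,
// send the base64 MD5 in a Content-MD5 header, and return the HTTP response code.
private int upload(URL url, byte[] content, String contentMd5, int chunkSize) throws IOException {
  HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
  try {
    urlConn.setDoOutput(true);
    urlConn.setRequestMethod("POST");
    // Chunked streaming is what would let the service observe per-chunk sizes.
    urlConn.setChunkedStreamingMode(chunkSize);
    // A mismatching digest is what the test expects to be rejected with 400.
    urlConn.setRequestProperty("Content-MD5", contentMd5);
    try (OutputStream os = urlConn.getOutputStream()) {
      os.write(content);
    }
    return urlConn.getResponseCode();
  } finally {
    urlConn.disconnect();
  }
}

Using two different chunk sizes (30 and 20) for the two calls is consistent with the tracking-table assertion above that at least two distinct chunk sizes are recorded.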
Use of org.apache.twill.filesystem.Location in project cdap by caskdata, in the class PartitionConcatenateTest, method testConcatenate.
/**
 * 1. Write 100 small files (ORC format) to a partition of a PartitionedFileSet.
 * 2. Execute a partition concatenate operation.
 * 3. Validate that, compared to before the concatenate operation, the number of files is reduced
 *    while the contents of the files remain the same.
 */
@Test
public void testConcatenate() throws Exception {
  String orcPFS = "orcPFS";
  addDatasetInstance(PartitionedFileSet.class.getName(), orcPFS, PartitionedFileSetProperties.builder()
    .setPartitioning(Partitioning.builder().addLongField("time").build())
    .setOutputFormat(OrcNewOutputFormat.class)
    .setEnableExploreOnCreate(true)
    .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
    .setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
    .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
    .setExploreSchema("record STRING")
    .build());
  // 1. create 100 small files in the input FileSet
  DataSetManager<PartitionedFileSet> cleanRecordsManager = getDataset(orcPFS);
  PartitionedFileSet cleanRecords = cleanRecordsManager.get();
  PartitionKey outputPartition = PartitionKey.builder().addLongField("time", 5000).build();
  PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
  Location partitionLocation = partitionOutput.getLocation();
  int numInputFiles = 100;
  List<String> writtenData = writeSmallOrcFiles(partitionLocation, numInputFiles);
  partitionOutput.addPartition();
  Assert.assertEquals(writtenData, getExploreResults(orcPFS));
  // this is a timestamp taken after writing the files but before concatenating
  long beforeConcatTime = System.currentTimeMillis();
  List<Location> dataFiles = listFilteredChildren(partitionLocation);
  // each input file results in one output file, due to the FileInputFormat and FileOutputFormat
  // classes being used
  Assert.assertEquals(numInputFiles, dataFiles.size());
  for (Location dataFile : dataFiles) {
    // all files should have a lastModified earlier than the pre-concatenate timestamp
    Assert.assertTrue(dataFile.lastModified() < beforeConcatTime);
  }
  // 2. run the concatenate operation
  cleanRecords.concatenatePartition(outputPartition).get();
  // 3. check that the data files' lastModified timestamps are updated and that there are fewer files
  dataFiles = listFilteredChildren(partitionLocation);
  Assert.assertTrue(dataFiles.size() < numInputFiles);
  // the single remaining file should have a lastModified later than the pre-concatenate timestamp
  Assert.assertTrue(Iterables.getOnlyElement(dataFiles).lastModified() > beforeConcatTime);
  // even though the files were concatenated, the explore results should be unchanged
  Assert.assertEquals(writtenData, getExploreResults(orcPFS));
}
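The listFilteredChildren helper is not part of this excerpt. A hypothetical sketch, assuming its purpose is to skip bookkeeping entries (such as _SUCCESS markers and hidden "."-prefixed files) so that only real data files are counted:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.twill.filesystem.Location;

// Hypothetical sketch: list the data files of a partition directory, filtering out
// entries that Hadoop output committers typically leave behind.
private List<Location> listFilteredChildren(Location partitionLocation) throws IOException {
  List<Location> children = new ArrayList<>();
  for (Location child : partitionLocation.list()) {
    String name = child.getName();
    if (!name.startsWith(".") && !"_SUCCESS".equals(name)) {
      children.add(child);
    }
  }
  return children;
}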