Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class PartitionedFileSetTest, method createPartition.
// creates a file under the partition path for the given key and adds it to the PartitionedFileSet
// writes the given int into the file and asserts that it exists
private Location createPartition(PartitionedFileSet pfs, PartitionKey key, String fileName, int intToWrite) throws IOException {
  PartitionOutput partitionOutput = pfs.getPartitionOutput(key);
  Location outputLocation = partitionOutput.getLocation().append(fileName);
  Assert.assertFalse(outputLocation.exists());
  try (OutputStream outputStream = outputLocation.getOutputStream()) {
    outputStream.write(intToWrite);
  }
  Assert.assertTrue(outputLocation.exists());
  partitionOutput.addPartition();
  return outputLocation;
}
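A hypothetical call site for this helper might look like the sketch below. The key fields, file name, and value written are assumptions for illustration only; in the actual test class, PartitionedFileSet operations are executed inside a transaction.

// Hypothetical call site for the helper above; key fields, file name, and value are illustrative only.
PartitionKey key = PartitionKey.builder().addIntField("i", 1).addStringField("s", "a").build();
Location file = createPartition(pfs, key, "part1", 42);
// after the helper returns, the partition should be registered in the dataset
Assert.assertNotNull(pfs.getPartition(key));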
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class SparkFileSetTestRun, method testSparkWithPartitionedFileSet.
private void testSparkWithPartitionedFileSet(ApplicationManager applicationManager, String sparkProgram) throws Exception {
  DataSetManager<PartitionedFileSet> pfsManager = getDataset("pfs");
  PartitionedFileSet pfs = pfsManager.get();

  // write an input partition with key x="nn"
  PartitionOutput partitionOutput = pfs.getPartitionOutput(PartitionKey.builder().addStringField("x", "nn").build());
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  pfsManager.flush();

  // configure the input to select partitions whose x value lies between "na" and "nx",
  // and the output to write to the partition with key x="xx"
  Map<String, String> inputArgs = new HashMap<>();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addRangeCondition("x", "na", "nx").build());
  Map<String, String> outputArgs = new HashMap<>();
  PartitionKey outputKey = PartitionKey.builder().addStringField("x", "xx").build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  Map<String, String> args = new HashMap<>();
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", inputArgs));
  args.putAll(RuntimeArguments.addScope(Scope.DATASET, "pfs", outputArgs));
  args.put("input", "pfs");
  args.put("output", "pfs");

  // run the Spark program with the scoped dataset arguments and wait for it to complete
  SparkManager sparkManager = applicationManager.getSparkManager(sparkProgram).start(args);
  sparkManager.waitForRun(ProgramRunStatus.COMPLETED, 10, TimeUnit.MINUTES);
  pfsManager.flush();

  // verify that the output partition exists and contains the expected data
  PartitionDetail partition = pfs.getPartition(outputKey);
  Assert.assertNotNull(partition);
  validateFileOutput(partition.getLocation());

  // cleanup after test completed
  pfs.dropPartition(partitionOutput.getPartitionKey());
  pfs.dropPartition(partition.getPartitionKey());
  pfsManager.flush();
}
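The helpers prepareFileInput and validateFileOutput are defined elsewhere in SparkFileSetTestRun and are not part of this snippet. A minimal sketch of what an input-preparation helper could look like follows; the file name, contents, and use of java.nio.charset.StandardCharsets are assumptions for illustration, not the project's actual implementation.

// A hypothetical input-preparation helper: write a small text file under the partition location.
// File name and contents are assumptions; the real helper may write different data.
private void prepareFileInput(Location location) throws IOException {
  try (OutputStream out = location.append("input.txt").getOutputStream()) {
    out.write("line1\nline2\nline3\n".getBytes(StandardCharsets.UTF_8));
  }
}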
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class PartitionConcatenateTest, method testConcatenate.
/**
 * 1. Write 100 small files (ORC format) to a partition of a PartitionedFileSet.
 * 2. Execute a partition concatenate operation.
 * 3. As compared to before the concatenate operation, validate that the number of files is reduced,
 *    while the contents of the files remain the same.
 */
@Test
public void testConcatenate() throws Exception {
  String orcPFS = "orcPFS";
  addDatasetInstance(PartitionedFileSet.class.getName(), orcPFS,
    PartitionedFileSetProperties.builder()
      .setPartitioning(Partitioning.builder().addLongField("time").build())
      .setOutputFormat(OrcNewOutputFormat.class)
      .setEnableExploreOnCreate(true)
      .setSerDe("org.apache.hadoop.hive.ql.io.orc.OrcSerde")
      .setExploreInputFormat("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")
      .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")
      .setExploreSchema("record STRING")
      .build());

  // 1. create 100 small files in the input FileSet
  DataSetManager<PartitionedFileSet> cleanRecordsManager = getDataset(orcPFS);
  PartitionedFileSet cleanRecords = cleanRecordsManager.get();
  PartitionKey outputPartition = PartitionKey.builder().addLongField("time", 5000).build();
  PartitionOutput partitionOutput = cleanRecords.getPartitionOutput(outputPartition);
  Location partitionLocation = partitionOutput.getLocation();
  int numInputFiles = 100;
  List<String> writtenData = writeSmallOrcFiles(partitionLocation, numInputFiles);
  partitionOutput.addPartition();
  Assert.assertEquals(writtenData, getExploreResults(orcPFS));

  // this is a timestamp taken before concatenating, but after writing the files
  long beforeConcatTime = System.currentTimeMillis();
  List<Location> dataFiles = listFilteredChildren(partitionLocation);
  // each input file results in one output file, due to the FileInputFormat and FileOutputFormat classes being used
  Assert.assertEquals(numInputFiles, dataFiles.size());
  for (Location dataFile : dataFiles) {
    // all the files should have a lastModified smaller than the pre-concatenate timestamp
    Assert.assertTrue(dataFile.lastModified() < beforeConcatTime);
  }

  // 2. run the concatenate operation
  cleanRecords.concatenatePartition(outputPartition).get();

  // 3. check that the data files' lastModified timestamp is updated, and that there are fewer of them
  dataFiles = listFilteredChildren(partitionLocation);
  Assert.assertTrue(dataFiles.size() < numInputFiles);
  // the remaining file should have a lastModified larger than the pre-concatenate timestamp
  Assert.assertTrue(Iterables.getOnlyElement(dataFiles).lastModified() > beforeConcatTime);
  // even though the files were concatenated, the explore results should be unchanged
  Assert.assertEquals(writtenData, getExploreResults(orcPFS));
}
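The helper listFilteredChildren is not shown here. A plausible sketch, assuming it simply lists the children of the partition directory and skips hidden files and Hadoop marker files (names starting with "." or "_", such as _SUCCESS), is given below; the filtering rule and the java.util.ArrayList import are assumptions for illustration.

// A hypothetical version of listFilteredChildren: list the children of a location,
// skipping hidden files and Hadoop marker files. The filtering rule is an assumption.
private List<Location> listFilteredChildren(Location location) throws IOException {
  List<Location> children = new ArrayList<>();
  for (Location child : location.list()) {
    String name = child.getName();
    if (!name.startsWith(".") && !name.startsWith("_")) {
      children.add(child);
    }
  }
  return children;
}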
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class DynamicPartitionerWithAvroTest, method writeFile.
private void writeFile(PartitionedFileSet pfs, PartitionKey key) throws IOException {
  PartitionOutput partitionOutput = pfs.getPartitionOutput(key);
  // create an (empty) file under the partition directory
  partitionOutput.getLocation().mkdirs();
  partitionOutput.getLocation().append("file").createNew();
  // attach metadata to the partition and register it
  partitionOutput.setMetadata(ImmutableMap.of("file", "file"));
  partitionOutput.addPartition();
}
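A hypothetical call to this helper, followed by reading the partition back to check its metadata, could look like the sketch below. The partition key field "x" and its value are illustrative only; only calls already shown in these examples are used.

// Hypothetical usage of the helper above; the partition key field is illustrative only.
PartitionKey key = PartitionKey.builder().addStringField("x", "1").build();
writeFile(pfs, key);
PartitionDetail detail = pfs.getPartition(key);
Assert.assertNotNull(detail);
Assert.assertEquals("file", detail.getMetadata().asMap().get("file"));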
Use of io.cdap.cdap.api.dataset.lib.PartitionOutput in project cdap by caskdata.
The class PartitionedFileSetTest, method testPartitionMetadata.
@Test
public void testPartitionMetadata() throws Exception {
  final PartitionedFileSet dataset = dsFrameworkUtil.getInstance(pfsInstance);
  dsFrameworkUtil.newTransactionExecutor((TransactionAware) dataset).execute(new TransactionExecutor.Subroutine() {
    @Override
    public void apply() throws Exception {
      PartitionKey partitionKey = PartitionKey.builder()
        .addIntField("i", 42)
        .addLongField("l", 17L)
        .addStringField("s", "x")
        .build();
      // add a partition with metadata attached
      ImmutableMap<String, String> metadata = ImmutableMap.of("key1", "value", "key2", "value2", "key3", "value2");
      PartitionOutput partitionOutput = dataset.getPartitionOutput(partitionKey);
      partitionOutput.setMetadata(metadata);
      partitionOutput.addPartition();
      // read the partition back and verify its metadata
      PartitionDetail partitionDetail = dataset.getPartition(partitionKey);
      Assert.assertNotNull(partitionDetail);
      Assert.assertEquals(metadata, partitionDetail.getMetadata().asMap());
    }
  });
}
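As a follow-up, a test written in the same style could drop the partition again and verify that it is no longer returned. The sketch below only combines calls already shown in these examples and would likewise need to run inside the transaction.

// Hypothetical continuation in the same transactional style: drop the partition and verify it is gone.
dataset.dropPartition(partitionKey);
Assert.assertNull(dataset.getPartition(partitionKey));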