Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
The class StreamConversionMapReduce, method initialize:
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Job job = context.getHadoopJob();
  job.setMapperClass(StreamConversionMapper.class);
  job.setNumReduceTasks(0);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(NullWritable.class);
  AvroJob.setOutputKeySchema(job, SCHEMA);
  // read 5 minutes of events from the stream, ending at the logical start time of this run
  long logicalTime = context.getLogicalStartTime();
  context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));
  // each run writes its output to a partition keyed by the logical start time
  Map<String, String> dsArguments = new HashMap<>(); // runtime arguments for the output dataset (declaration added for completeness)
  TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
  context.addOutput(Output.ofDataset("converted", dsArguments));
  TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
  LOG.info("Output location for new partition is: {}", partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
}
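On the read side, the same arguments class scopes a TimePartitionedFileSet input to a time range. A minimal sketch of a consuming program's initialize(), assuming the "converted" dataset from above; the one-hour window is illustrative:

// select all partitions written in the hour before this run (sketch)
Map<String, String> inputArgs = new HashMap<>();
long endTime = getContext().getLogicalStartTime();
TimePartitionedFileSetArguments.setInputStartTime(inputArgs, endTime - TimeUnit.HOURS.toMillis(1));
TimePartitionedFileSetArguments.setInputEndTime(inputArgs, endTime);
getContext().addInput(Input.ofDataset("converted", inputArgs));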
Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
The class PartitionCorrectorTestRun, method createPartition:
private void createPartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long time, int i) throws Exception {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  TimePartitionOutput output = tpfs.getPartitionOutput(time);
  try (PrintStream out = new PrintStream(output.getLocation().append("file").getOutputStream())) {
    out.println(String.format("%d,x%d", i, i));
  }
  output.addPartition();
  tpfsManager.flush();
}
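Reading the partition back follows the inverse pattern. A minimal sketch using the same tpfsManager, time, and i as above; getPartitionByTime returns null when no partition exists for the given time:

TimePartitionedFileSet tpfs = tpfsManager.get();
TimePartitionDetail partition = tpfs.getPartitionByTime(time);
Assert.assertNotNull(partition);
// the file name "file" matches what createPartition wrote above
try (BufferedReader reader = new BufferedReader(
    new InputStreamReader(partition.getLocation().append("file").getInputStream(), StandardCharsets.UTF_8))) {
  Assert.assertEquals(String.format("%d,x%d", i, i), reader.readLine());
}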
Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
The class SparkFileSetTestRun, method addTimePartition:
private void addTimePartition(DataSetManager<TimePartitionedFileSet> tpfsManager, long inputTime) throws IOException, TransactionFailureException, InterruptedException {
  TimePartitionedFileSet tpfs = tpfsManager.get();
  PartitionOutput partitionOutput = tpfs.getPartitionOutput(inputTime);
  Location location = partitionOutput.getLocation();
  prepareFileInput(location);
  partitionOutput.addPartition();
  tpfsManager.flush();
}
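On the consuming side, a Spark run can be restricted to such a partition through runtime arguments scoped to the dataset. A minimal sketch; sparkManager, the dataset name "tpfs", and the one-hour window are illustrative:

// scope the Spark program's input dataset to a window around inputTime (sketch)
Map<String, String> args = new HashMap<>();
TimePartitionedFileSetArguments.setInputStartTime(args, inputTime - TimeUnit.MINUTES.toMillis(30));
TimePartitionedFileSetArguments.setInputEndTime(args, inputTime + TimeUnit.MINUTES.toMillis(30));
sparkManager.start(RuntimeArguments.addScope(Scope.DATASET, "tpfs", args));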
Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
The class StreamConversionTest, method testStreamConversion:
@Test
public void testStreamConversion() throws Exception {
  // deploy the StreamConversionApp application
  ApplicationManager appManager = deployApplication(StreamConversionApp.class);
  // send some data to the events stream
  StreamManager streamManager = getStreamManager("events");
  streamManager.send("15");
  streamManager.send("16");
  streamManager.send("17");
  // record the current time, adding 1 ms in case the stream events were added with the same timestamp as the current time
  final long startTime = System.currentTimeMillis() + 1;
  // run the mapreduce
  MapReduceManager mapReduceManager = appManager.getMapReduceManager("StreamConversionMapReduce")
    .start(ImmutableMap.of("logical.start.time", Long.toString(startTime)));
  mapReduceManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
  // verify the single partition in the file set
  DataSetManager<TimePartitionedFileSet> fileSetManager = getDataset("converted");
  Assert.assertNotNull(fileSetManager.get().getPartitionByTime(startTime));
  // derive the expected partition time fields from the start time (Calendar months are zero-based, hence the + 1)
  Calendar calendar = Calendar.getInstance();
  calendar.setTimeInMillis(startTime);
  int year = calendar.get(Calendar.YEAR);
  int month = calendar.get(Calendar.MONTH) + 1;
  int day = calendar.get(Calendar.DAY_OF_MONTH);
  int hour = calendar.get(Calendar.HOUR_OF_DAY);
  int minute = calendar.get(Calendar.MINUTE);
  // query with SQL
  Connection connection = getQueryClient();
  ResultSet results = connection
    .prepareStatement("SELECT year, month, day, hour, minute FROM dataset_converted WHERE body = '17'")
    .executeQuery();
  // should return only one row, with correct time fields
  Assert.assertTrue(results.next());
  Assert.assertEquals(year, results.getInt(1));
  Assert.assertEquals(month, results.getInt(2));
  Assert.assertEquals(day, results.getInt(3));
  Assert.assertEquals(hour, results.getInt(4));
  Assert.assertEquals(minute, results.getInt(5));
  Assert.assertFalse(results.next());
}
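Besides SQL, the new partition's files can also be read directly. A minimal sketch of scanning the run's Avro output; the .avro file suffix and the record field name "body" are assumptions based on the job configuration and the query above:

TimePartitionDetail partition = fileSetManager.get().getPartitionByTime(startTime);
for (Location file : partition.getLocation().list()) {
  if (file.getName().endsWith(".avro")) { // assumed naming of the Avro output files
    try (DataFileStream<GenericRecord> records =
        new DataFileStream<>(file.getInputStream(), new GenericDatumReader<GenericRecord>())) {
      while (records.hasNext()) {
        System.out.println(records.next().get("body"));
      }
    }
  }
}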
Use of co.cask.cdap.api.dataset.lib.TimePartitionedFileSet in project cdap by caskdata.
The class TimePartitionedFileSetTest, method testTimePartitionedInputArguments:
@Test
public void testTimePartitionedInputArguments() throws Exception {
  final long time8 = DATE_FORMAT.parse("10/17/2014 8:42 am").getTime();
  final long time9 = DATE_FORMAT.parse("10/17/2014 9:42 am").getTime();
  final String path8 = "8:42";
  final String path9 = "9:42";
  final PartitionFilter filter9 = PartitionFilter.builder().addRangeCondition("hour", 9, null).build();
  // add a few partitions
  {
    final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
    final TransactionAware txAwareDataset = (TransactionAware) dataset;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
      @Override
      public void apply() throws Exception {
        dataset.addPartition(time8, path8);
        dataset.addPartition(time9, path9);
      }
    });
  }
  // test specifying a time range for input
  Map<String, String> arguments = Maps.newHashMap();
  TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 - 30 * MINUTE);
  TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
  testInputConfiguration(arguments, path8);
  // add a partition filter. It has no effect as long as a time range is given.
  TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
  testInputConfiguration(arguments, path8);
  // test specifying input with only a partition filter
  arguments.clear();
  TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
  testInputConfiguration(arguments, path9);
  // test specifying only a start time or only an end time for input: both must fail
  arguments.clear();
  TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 + 30 * MINUTE);
  testInputConfigurationFailure(arguments, " with only a start time");
  arguments.clear();
  TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
  testInputConfigurationFailure(arguments, " with only an end time");
}
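For reference, PartitionFilter also supports exact values and bounded ranges in addition to the open-ended range used above. A minimal sketch; the field names year, month, day, and hour are the built-in time partitioning columns seen in the SQL test earlier:

// match hour in [8, 10) on one specific day (sketch)
PartitionFilter filter = PartitionFilter.builder()
  .addValueCondition("year", 2014)
  .addValueCondition("month", 10)
  .addValueCondition("day", 17)
  .addRangeCondition("hour", 8, 10) // lower bound inclusive, upper bound exclusive
  .build();
TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);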