Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.
The class PartitionFilterTest, method testBuilderGetterMatch.
@Test
public void testBuilderGetterMatch() {
  long minute = TimeUnit.MINUTES.toMillis(1);
  PartitionFilter filter = PartitionFilter.builder()
    .addValueCondition("year", 2012)
    .addRangeCondition("month", 4, 7)
    .addValueCondition("market", "asia")
    .addRangeCondition("duration", 60 * minute, 90 * minute)
    .build();
  Assert.assertEquals(4, filter.getConditions().size());
  validateCondition(filter, "year", true, Partitioning.FieldType.INT, 2012, null, 2011, 2013);
  validateCondition(filter, "month", false, Partitioning.FieldType.INT, 4, 5, 6, null, 3, 7, 8);
  validateCondition(filter, "market", true, Partitioning.FieldType.STRING, "asia", null, "america", "", "europe");
  validateCondition(filter, "duration", false, Partitioning.FieldType.LONG,
                    60 * minute, 80 * minute, 89 * minute, 90 * minute - 1,
                    null, minute, 30 * minute, 60 * minute - 1, 90 * minute, Long.MAX_VALUE, Long.MIN_VALUE, 0L);
  // should match
  Assert.assertTrue(filter.match(PartitionKey.builder()
    .addField("month", 4).addField("duration", 75 * minute)
    .addField("market", "asia").addField("year", 2012)
    .build()));
  // out of range ("month" is 7, and "market" is also missing)
  Assert.assertFalse(filter.match(PartitionKey.builder()
    .addField("month", 7).addField("duration", 75 * minute)
    .addField("year", 2012)
    .build()));
  // required field "market" missing
  Assert.assertFalse(filter.match(PartitionKey.builder()
    .addField("month", 4).addField("duration", 75 * minute)
    .addField("year", 2012)
    .build()));
  // extra field "day" is allowed
  Assert.assertTrue(filter.match(PartitionKey.builder()
    .addField("day", "tue").addField("month", 4).addField("duration", 75 * minute)
    .addField("year", 2012).addField("market", "asia")
    .build()));
}
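A note on the range semantics this test pins down: addRangeCondition treats the lower bound as inclusive and the upper bound as exclusive, which is why month = 4 matches the [4, 7) condition above while month = 7 does not. A minimal standalone sketch (not part of the test class; the same imports are assumed):

PartitionFilter monthFilter = PartitionFilter.builder()
  .addRangeCondition("month", 4, 7)  // matches 4, 5, 6; rejects 3 and 7
  .build();
// lower bound is inclusive
Assert.assertTrue(monthFilter.match(PartitionKey.builder().addField("month", 4).build()));
// upper bound is exclusive
Assert.assertFalse(monthFilter.match(PartitionKey.builder().addField("month", 7).build()));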
Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.
The class PartitionFilterTest, method testBuilderNullRange.
@Test
public void testBuilderNullRange() {
  PartitionFilter filter = PartitionFilter.builder()
    .addValueCondition("a", 1)
    .<Long>addRangeCondition("x", null, null)
    .build();
  // only the condition for "a" remains; the fully unbounded range for "x" is dropped
  Assert.assertEquals(1, filter.getConditions().size());
  Assert.assertNull(filter.getCondition("x"));
}
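In other words, a range condition that is unbounded on both ends constrains nothing, so the builder silently drops it rather than storing an always-true condition. A small sketch of the consequence (assumed to run with the same imports as the test):

PartitionFilter filter = PartitionFilter.builder()
  .addValueCondition("a", 1)
  .<Long>addRangeCondition("x", null, null)  // no-op: unbounded in both directions
  .build();
// "x" is unconstrained, so a key carrying any value for it still matches
Assert.assertTrue(filter.match(PartitionKey.builder().addField("a", 1).addField("x", 5L).build()));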
Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.
The class PartitionedFileSetArgumentsTest, method testSetGetInputPartitionFilter.
@Test
public void testSetGetInputPartitionFilter() throws Exception {
  Map<String, String> arguments = new HashMap<>();
  PartitionFilter filter = PartitionFilter.builder()
    .addRangeCondition("i", 30, 40)
    .addValueCondition("l", 17L)
    .addValueCondition("s", "x")
    .build();
  PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
  Assert.assertEquals(filter, PartitionedFileSetArguments.getInputPartitionFilter(arguments));

  arguments = new HashMap<>();
  filter = PartitionFilter.builder()
    .addRangeCondition("i", 30, 40)
    .addValueCondition("s", "x")
    .build();
  PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
  Assert.assertEquals(filter, PartitionedFileSetArguments.getInputPartitionFilter(arguments));

  arguments = new HashMap<>();
  filter = PartitionFilter.ALWAYS_MATCH;
  PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
  Assert.assertEquals(filter, PartitionedFileSetArguments.getInputPartitionFilter(arguments));
}
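The setter/getter pair simply round-trips the filter through the string map. To take effect at runtime, such a map is typically scoped to the input dataset when a program is started; a hedged sketch using CDAP's RuntimeArguments helper from co.cask.cdap.api.common (the dataset name "records" is hypothetical):

Map<String, String> dsArguments = new HashMap<>();
PartitionFilter filter = PartitionFilter.builder().addRangeCondition("i", 30, 40).build();
PartitionedFileSetArguments.setInputPartitionFilter(dsArguments, filter);
// prefix the keys so they apply only to the dataset named "records" (hypothetical name)
Map<String, String> runtimeArgs = RuntimeArguments.addScope(Scope.DATASET, "records", dsArguments);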
Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.
The class DataCleansingMapReduceTest, method testPartitionConsuming.
@Test
public void testPartitionConsuming() throws Exception {
  ApplicationManager applicationManager = deployApplication(DataCleansing.class);
  ServiceManager serviceManager = applicationManager.getServiceManager(DataCleansingService.NAME).start();
  serviceManager.waitForStatus(true);
  URL serviceURL = serviceManager.getServiceURL();

  // write a set of records to one partition and run the DataCleansingMapReduce job on that one partition
  createPartition(serviceURL, RECORD_SET1);

  // before starting the MR, there are 0 invalid records and 0 valid records, according to metrics
  Assert.assertEquals(0, getValidityMetrics(true));
  Assert.assertEquals(0, getValidityMetrics(false));
  Long now = System.currentTimeMillis();
  ImmutableMap<String, String> args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(),
                                                      DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
  MapReduceManager mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
  mapReduceManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(RECORD_SET1, true));
  compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(RECORD_SET1, false));

  // assert that some of the records have indeed been filtered
  Assert.assertNotEquals(filterRecords(RECORD_SET1, true), RECORD_SET1);
  Assert.assertNotEquals(filterRecords(RECORD_SET1, false), Collections.<String>emptySet());

  // verify this via metrics
  Assert.assertEquals(1, getValidityMetrics(true));
  Assert.assertEquals(1, getValidityMetrics(false));

  // create two additional partitions
  createPartition(serviceURL, RECORD_SET2);
  createPartition(serviceURL, RECORD_SET3);

  // running the MapReduce job now processes the two new partitions (RECORD_SET2 and RECORD_SET3) and creates
  // a new partition with the output
  now = System.currentTimeMillis();
  args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(),
                         DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
  mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
  mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);

  ImmutableSet<String> recordSets2and3 = ImmutableSet.<String>builder().addAll(RECORD_SET2).addAll(RECORD_SET3).build();
  compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(recordSets2and3, true));
  compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(recordSets2and3, false));

  // verify this via metrics
  Assert.assertEquals(1, getValidityMetrics(true));
  Assert.assertEquals(5, getValidityMetrics(false));

  // running the MapReduce job without adding new partitions creates no additional output
  now = System.currentTimeMillis();
  args = ImmutableMap.of(DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(),
                         DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
  mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
  mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
  compareData(now, DataCleansing.CLEAN_RECORDS, Collections.<String>emptySet());
  compareData(now, DataCleansing.INVALID_RECORDS, Collections.<String>emptySet());

  // verify that the records were properly partitioned on their zip
  DataSetManager<PartitionedFileSet> cleanRecords = getDataset(DataCleansing.CLEAN_RECORDS);
  PartitionFilter filter = PartitionFilter.builder().addValueCondition("zip", 84125).build();
  Assert.assertEquals(ImmutableSet.of(RECORD1, RECORD4, RECORD6), getDataFromFilter(cleanRecords.get(), filter));
  filter = PartitionFilter.builder().addValueCondition("zip", 84126).build();
  Assert.assertEquals(ImmutableSet.of(RECORD3, RECORD5), getDataFromFilter(cleanRecords.get(), filter));
}
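getDataFromFilter is a private helper of this test class whose body is not shown here. A plausible reconstruction, assuming the standard PartitionedFileSet.getPartitions(PartitionFilter) API, org.apache.twill.filesystem.Location for partition files, and one record per line; the file-reading details are illustrative:

private static Set<String> getDataFromFilter(PartitionedFileSet pfs, PartitionFilter filter) throws IOException {
  Set<String> records = new HashSet<>();
  // select only the partitions whose keys satisfy the filter
  for (PartitionDetail partition : pfs.getPartitions(filter)) {
    for (Location file : partition.getLocation().list()) {
      try (BufferedReader reader =
             new BufferedReader(new InputStreamReader(file.getInputStream(), StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
          records.add(line);
        }
      }
    }
  }
  return records;
}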
Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.
The class TimePartitionedFileSetDataset, method partitionFiltersForTimeRange.
// Returns a list of partition filters that cover the specified time range.
// This may return a list containing a single null filter (in case the range is unbounded in both directions).
@VisibleForTesting
static List<PartitionFilter> partitionFiltersForTimeRange(long startTime, long endTime) {
  // unsatisfiable range
  if (startTime >= endTime) {
    return Collections.emptyList();
  }
  PartitionKey keyLower = startTime <= 0 ? null : partitionKeyForTime(startTime);
  PartitionKey keyUpper = endTime == Long.MAX_VALUE ? null : partitionKeyForTime(endTime);
  // no bounds -> no filter
  if (keyLower == null && keyUpper == null) {
    // no filter needed to select all time
    return Collections.singletonList(null);
  }
  List<PartitionFilter> filters = Lists.newArrayList();
  String[] allFields = PARTITIONING.getFields().keySet().toArray(new String[PARTITIONING.getFields().size()]);
  // if there is no lower bound, we only need the filters for the upper bound
  if (keyLower == null) {
    addUpperFilters(allFields, 0, keyUpper, filters, initialSupplier());
    return filters;
  }
  // if there is no upper bound, we only need the filters for the lower bound
  if (keyUpper == null) {
    addLowerFilters(allFields, 0, keyLower, filters, initialSupplier());
    return filters;
  }
  return filtersFor(allFields, 0, keyLower, keyUpper, filters, initialSupplier());
}
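Two edge cases fall directly out of this code and make convenient sanity checks; a sketch, callable from a test in the same package since the method is package-private:

// start >= end is unsatisfiable, so no filters are returned
Assert.assertTrue(TimePartitionedFileSetDataset.partitionFiltersForTimeRange(1000L, 1000L).isEmpty());
// unbounded in both directions yields a single null filter, meaning "select all partitions"
List<PartitionFilter> filters = TimePartitionedFileSetDataset.partitionFiltersForTimeRange(0L, Long.MAX_VALUE);
Assert.assertEquals(Collections.singletonList(null), filters);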