Example 1 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

From the class PartitionFilterTest, method testBuilderGetterMatch.

@Test
public void testBuilderGetterMatch() {
    long minute = TimeUnit.MINUTES.toMillis(1);
    PartitionFilter filter = PartitionFilter.builder()
        .addValueCondition("year", 2012)
        .addRangeCondition("month", 4, 7)
        .addValueCondition("market", "asia")
        .addRangeCondition("duration", 60 * minute, 90 * minute)
        .build();
    Assert.assertEquals(4, filter.getConditions().size());
    validateCondition(filter, "year", true, Partitioning.FieldType.INT, 2012, null, 2011, 2013);
    validateCondition(filter, "month", false, Partitioning.FieldType.INT, 4, 5, 6, null, 3, 7, 8);
    validateCondition(filter, "market", true, Partitioning.FieldType.STRING, "asia", null, "america", "", "europe");
    validateCondition(filter, "duration", false, Partitioning.FieldType.LONG, 60 * minute, 80 * minute, 89 * minute, 90 * minute - 1, null, minute, 30 * minute, 60 * minute - 1, 90 * minute, Long.MAX_VALUE, Long.MIN_VALUE, 0L);
    // all conditions satisfied: should match
    Assert.assertTrue(filter.match(PartitionKey.builder()
        .addField("month", 4).addField("duration", 75 * minute)
        .addField("market", "asia").addField("year", 2012)
        .build()));
    // "month" is 7: out of range, because the upper bound of a range condition is exclusive
    Assert.assertFalse(filter.match(PartitionKey.builder()
        .addField("month", 7).addField("duration", 75 * minute)
        .addField("year", 2012)
        .build()));
    // the "market" field is missing from the key: no match
    Assert.assertFalse(filter.match(PartitionKey.builder()
        .addField("month", 4).addField("duration", 75 * minute)
        .addField("year", 2012)
        .build()));
    // an extra field ("day") does not prevent a match
    Assert.assertTrue(filter.match(PartitionKey.builder()
        .addField("day", "tue").addField("month", 4)
        .addField("duration", 75 * minute).addField("year", 2012)
        .addField("market", "asia")
        .build()));
}
Also used: PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter), Test (org.junit.Test)
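
The assertions above imply that addRangeCondition treats the lower bound as inclusive and the upper bound as exclusive. A minimal sketch of just that semantics, reusing only calls and imports already shown in the test:

    // sketch: addRangeCondition(field, lower, upper) matches lower <= value < upper
    PartitionFilter monthFilter = PartitionFilter.builder()
        .addRangeCondition("month", 4, 7)
        .build();
    // lower bound is inclusive
    Assert.assertTrue(monthFilter.match(PartitionKey.builder().addField("month", 4).build()));
    // upper bound is exclusive
    Assert.assertFalse(monthFilter.match(PartitionKey.builder().addField("month", 7).build()));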

Example 2 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

From the class PartitionFilterTest, method testBuilderNullRange.

@Test
public void testBuilderNullRange() {
    PartitionFilter filter = PartitionFilter.builder()
        .addValueCondition("a", 1)
        .<Long>addRangeCondition("x", null, null)
        .build();
    // only the condition for "a" remains; the open-ended range on "x" was dropped
    Assert.assertEquals(1, filter.getConditions().size());
    Assert.assertNull(filter.getCondition("x"));
}
Also used: PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter), Test (org.junit.Test)
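
Since the open-ended condition on "x" is dropped at build time, the resulting filter should behave as if only the condition on "a" had been added. A minimal sketch of that consequence; the match outcomes are inferred from the semantics asserted in Example 1, not from this test itself:

    // rebuild the filter from the test; the open-ended "x" condition is dropped
    PartitionFilter filter = PartitionFilter.builder()
        .addValueCondition("a", 1)
        .<Long>addRangeCondition("x", null, null)
        .build();
    // assumption (from Example 1's semantics): matching now depends only on "a"
    Assert.assertTrue(filter.match(PartitionKey.builder().addField("a", 1).build()));
    Assert.assertTrue(filter.match(PartitionKey.builder().addField("a", 1).addField("x", 5L).build()));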

Example 3 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

From the class PartitionedFileSetArgumentsTest, method testSetGetInputPartitionFilter.

@Test
public void testSetGetInputPartitionFilter() throws Exception {
    Map<String, String> arguments = new HashMap<>();
    PartitionFilter filter = PartitionFilter.builder()
        .addRangeCondition("i", 30, 40)
        .addValueCondition("l", 17L)
        .addValueCondition("s", "x")
        .build();
    PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
    Assert.assertEquals(filter, PartitionedFileSetArguments.getInputPartitionFilter(arguments));
    arguments = new HashMap<>();
    filter = PartitionFilter.builder()
        .addRangeCondition("i", 30, 40)
        .addValueCondition("s", "x")
        .build();
    PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
    Assert.assertEquals(filter, PartitionedFileSetArguments.getInputPartitionFilter(arguments));
    arguments = new HashMap<>();
    filter = PartitionFilter.ALWAYS_MATCH;
    PartitionedFileSetArguments.setInputPartitionFilter(arguments, filter);
    Assert.assertEquals(filter, PartitionedFileSetArguments.getInputPartitionFilter(arguments));
}
Also used: PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter), HashMap (java.util.HashMap), Test (org.junit.Test)
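
The round-trip assertion only works because PartitionFilter implements value equality: the filter decoded from the arguments map is a different object but compares equal to the original. A minimal sketch of that property, inferred from the assertEquals calls above rather than asserted by this test directly:

    // two independently built filters with the same conditions compare equal
    PartitionFilter a = PartitionFilter.builder().addValueCondition("s", "x").build();
    PartitionFilter b = PartitionFilter.builder().addValueCondition("s", "x").build();
    Assert.assertEquals(a, b);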

Example 4 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

From the class DataCleansingMapReduceTest, method testPartitionConsuming.

@Test
public void testPartitionConsuming() throws Exception {
    ApplicationManager applicationManager = deployApplication(DataCleansing.class);
    ServiceManager serviceManager = applicationManager.getServiceManager(DataCleansingService.NAME).start();
    serviceManager.waitForStatus(true);
    URL serviceURL = serviceManager.getServiceURL();
    // write a set of records to one partition and run the DataCleansingMapReduce job on that one partition
    createPartition(serviceURL, RECORD_SET1);
    // before starting the MR, there are 0 invalid records and 0 valid records, according to metrics
    Assert.assertEquals(0, getValidityMetrics(true));
    Assert.assertEquals(0, getValidityMetrics(false));
    Long now = System.currentTimeMillis();
    ImmutableMap<String, String> args = ImmutableMap.of(
        DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(),
        DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
    MapReduceManager mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
    mapReduceManager.waitForRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(RECORD_SET1, true));
    compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(RECORD_SET1, false));
    // assert that some of the records have indeed been filtered
    Assert.assertNotEquals(filterRecords(RECORD_SET1, true), RECORD_SET1);
    Assert.assertNotEquals(filterRecords(RECORD_SET1, false), Collections.<String>emptySet());
    // verify this via metrics
    Assert.assertEquals(1, getValidityMetrics(true));
    Assert.assertEquals(1, getValidityMetrics(false));
    // create two additional partitions
    createPartition(serviceURL, RECORD_SET2);
    createPartition(serviceURL, RECORD_SET3);
    // running the MapReduce job now processes the two new partitions (RECORD_SET2 and RECORD_SET3)
    // and creates a new partition with the output
    now = System.currentTimeMillis();
    args = ImmutableMap.of(
        DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(),
        DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
    mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
    mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 2, 5, TimeUnit.MINUTES);
    ImmutableSet<String> recordSets2and3 = ImmutableSet.<String>builder()
        .addAll(RECORD_SET2).addAll(RECORD_SET3).build();
    compareData(now, DataCleansing.CLEAN_RECORDS, filterRecords(recordSets2and3, true));
    compareData(now, DataCleansing.INVALID_RECORDS, filterRecords(recordSets2and3, false));
    // verify this via metrics
    Assert.assertEquals(1, getValidityMetrics(true));
    Assert.assertEquals(5, getValidityMetrics(false));
    // running the MapReduce job without adding new partitions creates no additional output
    now = System.currentTimeMillis();
    args = ImmutableMap.of(
        DataCleansingMapReduce.OUTPUT_PARTITION_KEY, now.toString(),
        DataCleansingMapReduce.SCHEMA_KEY, schemaJson);
    mapReduceManager = applicationManager.getMapReduceManager(DataCleansingMapReduce.NAME).start(args);
    mapReduceManager.waitForRuns(ProgramRunStatus.COMPLETED, 3, 5, TimeUnit.MINUTES);
    compareData(now, DataCleansing.CLEAN_RECORDS, Collections.<String>emptySet());
    compareData(now, DataCleansing.INVALID_RECORDS, Collections.<String>emptySet());
    // verify that the records were properly partitioned on their zip
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(DataCleansing.CLEAN_RECORDS);
    PartitionFilter filter = PartitionFilter.builder().addValueCondition("zip", 84125).build();
    Assert.assertEquals(ImmutableSet.of(RECORD1, RECORD4, RECORD6), getDataFromFilter(cleanRecords.get(), filter));
    filter = PartitionFilter.builder().addValueCondition("zip", 84126).build();
    Assert.assertEquals(ImmutableSet.of(RECORD3, RECORD5), getDataFromFilter(cleanRecords.get(), filter));
}
Also used: ApplicationManager (co.cask.cdap.test.ApplicationManager), MapReduceManager (co.cask.cdap.test.MapReduceManager), PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter), ServiceManager (co.cask.cdap.test.ServiceManager), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), URL (java.net.URL), Test (org.junit.Test)
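
The two zip filters at the end are plain value conditions. A minimal sketch of how such a filter decides a match, using single-field keys purely for illustration (the real partition keys in this test carry more fields, which Example 1 shows is harmless):

    // a value condition matches only keys carrying exactly that value for the field
    PartitionFilter zip84125 = PartitionFilter.builder().addValueCondition("zip", 84125).build();
    Assert.assertTrue(zip84125.match(PartitionKey.builder().addField("zip", 84125).build()));
    Assert.assertFalse(zip84125.match(PartitionKey.builder().addField("zip", 84126).build()));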

Example 5 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

From the class TimePartitionedFileSetDataset, method partitionFiltersForTimeRange.

// returns a list of partition filters that cover the specified time range.
// this may return a list with a single null filter (in case the range is unbounded in both directions).
@VisibleForTesting
static List<PartitionFilter> partitionFiltersForTimeRange(long startTime, long endTime) {
    // unsatisfiable range
    if (startTime >= endTime) {
        return Collections.emptyList();
    }
    PartitionKey keyLower = startTime <= 0 ? null : partitionKeyForTime(startTime);
    PartitionKey keyUpper = endTime == Long.MAX_VALUE ? null : partitionKeyForTime(endTime);
    // no bounds -> no filter
    if (keyLower == null && keyUpper == null) {
        // no filter needed to select all time
        return Collections.singletonList(null);
    }
    List<PartitionFilter> filters = Lists.newArrayList();
    String[] allFields = PARTITIONING.getFields().keySet().toArray(new String[PARTITIONING.getFields().size()]);
    // if there is no lower bound, we only need the filters for the upper bound
    if (keyLower == null) {
        addUpperFilters(allFields, 0, keyUpper, filters, initialSupplier());
        return filters;
    }
    // if there is no upper bound, we only need the filters for the lower bound
    if (keyUpper == null) {
        addLowerFilters(allFields, 0, keyLower, filters, initialSupplier());
        return filters;
    }
    return filtersFor(allFields, 0, keyLower, keyUpper, filters, initialSupplier());
}
Also used: PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
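
The two edge cases in the method are worth spelling out. A minimal sketch of what a caller would observe, assuming the package-private helper is reachable from the calling code (as its @VisibleForTesting annotation suggests) and the usual List/Collections imports:

    // start >= end is unsatisfiable: no filters at all
    Assert.assertTrue(TimePartitionedFileSetDataset.partitionFiltersForTimeRange(100L, 100L).isEmpty());
    // unbounded in both directions: a single null filter, meaning "select everything"
    List<PartitionFilter> all = TimePartitionedFileSetDataset.partitionFiltersForTimeRange(0L, Long.MAX_VALUE);
    Assert.assertEquals(Collections.singletonList(null), all);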

Aggregations

PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter): 17 uses
Test (org.junit.Test): 12 uses
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 7 uses
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 6 uses
TransactionAware (org.apache.tephra.TransactionAware): 6 uses
TransactionExecutor (org.apache.tephra.TransactionExecutor): 5 uses
Partition (co.cask.cdap.api.dataset.lib.Partition): 4 uses
Predicate (co.cask.cdap.api.Predicate): 3 uses
DataSetException (co.cask.cdap.api.dataset.DataSetException): 3 uses
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 3 uses
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 3 uses
IOException (java.io.IOException): 3 uses
HashMap (java.util.HashMap): 3 uses
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 2 uses
PartitionAlreadyExistsException (co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException): 2 uses
FieldType (co.cask.cdap.api.dataset.lib.Partitioning.FieldType): 2 uses
Row (co.cask.cdap.api.dataset.table.Row): 2 uses
Table (co.cask.cdap.api.dataset.table.Table): 2 uses
ApplicationWithPrograms (co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms): 2 uses
BasicArguments (co.cask.cdap.internal.app.runtime.BasicArguments): 2 uses