Example 11 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

The class TimePartitionedFileSetTest, method testTimePartitionedInputArguments.

@Test
public void testTimePartitionedInputArguments() throws Exception {
    final long time8 = DATE_FORMAT.parse("10/17/2014 8:42 am").getTime();
    final long time9 = DATE_FORMAT.parse("10/17/2014 9:42 am").getTime();
    final String path8 = "8:42";
    final String path9 = "9:42";
    final PartitionFilter filter9 = PartitionFilter.builder().addRangeCondition("hour", 9, null).build();
    // add a few partitions
    {
        final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE);
        final TransactionAware txAwareDataset = (TransactionAware) dataset;
        dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {

            @Override
            public void apply() throws Exception {
                dataset.addPartition(time8, path8);
                dataset.addPartition(time9, path9);
            }
        });
    }
    // test specifying time range for input
    Map<String, String> arguments = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 - 30 * MINUTE);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
    testInputConfiguration(arguments, path8);
    // add a partition filter. it should not have an effect as long as there is a time range
    TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
    testInputConfiguration(arguments, path8);
    // test specifying input with a partition filter
    arguments.clear();
    TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9);
    testInputConfiguration(arguments, path9);
    // test specifying only a start time or only an end time for input
    arguments.clear();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 + 30 * MINUTE);
    testInputConfigurationFailure(arguments, " with only a start time");
    arguments.clear();
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
    testInputConfigurationFailure(arguments, " with only an end time");
}
Also used : PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) TransactionAware(org.apache.tephra.TransactionAware) TimePartitionedFileSet(co.cask.cdap.api.dataset.lib.TimePartitionedFileSet) Test(org.junit.Test)
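
The test above encodes a precedence rule worth noting: when the runtime arguments carry both a time range and a partition filter, the time range wins and the filter is ignored. A minimal sketch of assembling such arguments, using only the setter calls seen in the test; the class and method names of the sketch itself are illustrative, not part of CDAP:

import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;

import java.util.HashMap;
import java.util.Map;

public class TpfsInputArgsSketch {

    // select partitions whose time falls between startMillis and endMillis;
    // any partition filter also present in the arguments is ignored
    static Map<String, String> byTimeRange(long startMillis, long endMillis) {
        Map<String, String> args = new HashMap<>();
        TimePartitionedFileSetArguments.setInputStartTime(args, startMillis);
        TimePartitionedFileSetArguments.setInputEndTime(args, endMillis);
        return args;
    }

    // select partitions by filter alone; only consulted when no time range is set
    static Map<String, String> byFilter(PartitionFilter filter) {
        Map<String, String> args = new HashMap<>();
        TimePartitionedFileSetArguments.setInputPartitionFilter(args, filter);
        return args;
    }
}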

Example 12 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

The class PartitionFilterTest, method testIncompatibleMatch.

@Test(expected = IllegalArgumentException.class)
public void testIncompatibleMatch() {
    PartitionFilter filter = PartitionFilter.builder()
        .addValueCondition("year", 2012)
        .addRangeCondition("month", 4, 7)
        .addValueCondition("market", "asia")
        .build();
    // "month" is supplied as a String, but the filter's range condition expects an int
    filter.match(PartitionKey.builder()
        .addField("month", "january")
        .addField("market", "latin")
        .addField("year", 2012)
        .build());
}
Also used : PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) Test(org.junit.Test)
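
For contrast, a minimal sketch of a compatible match with the same filter; the key values are invented for illustration, and the range is assumed to be lower-inclusive, upper-exclusive. With compatible field types, match simply reports whether every condition is satisfied instead of throwing:

import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionKey;

public class CompatibleMatchSketch {
    public static void main(String[] args) {
        PartitionFilter filter = PartitionFilter.builder()
            .addValueCondition("year", 2012)
            .addRangeCondition("month", 4, 7)
            .addValueCondition("market", "asia")
            .build();
        boolean matches = filter.match(PartitionKey.builder()
            .addField("year", 2012)
            .addField("month", 5)       // an int, as the range condition expects
            .addField("market", "asia")
            .build());
        System.out.println(matches);    // true: all three conditions are satisfied
    }
}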

Example 13 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

The class DataCleansingMapReduceTest, method getDataFromFile.

private Set<String> getDataFromFile(Long time, String dsName) throws Exception {
    DataSetManager<PartitionedFileSet> cleanRecords = getDataset(dsName);
    PartitionFilter filter = PartitionFilter.builder().addValueCondition("time", time).build();
    return getDataFromFilter(cleanRecords.get(), filter);
}
Also used : PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet)
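
getDataFromFilter is a helper defined elsewhere in the test class and not shown on this page. A plausible sketch of what such a helper can look like, assuming each matching partition's location is a single newline-delimited text file (a real MapReduce output partition is typically a directory of part files, which would need an extra listing step); this is an illustration, not the test's actual code:

import co.cask.cdap.api.dataset.lib.PartitionDetail;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import org.apache.twill.filesystem.Location;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;

private Set<String> getDataFromFilter(PartitionedFileSet pfs, PartitionFilter filter) throws IOException {
    Set<String> records = new HashSet<>();
    // getPartitions returns the details of every partition matching the filter
    for (PartitionDetail partition : pfs.getPartitions(filter)) {
        Location location = partition.getLocation();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(location.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                records.add(line);
            }
        }
    }
    return records;
}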

Example 14 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

The class PartitionedFileSetDataset, method generateStartKey.

private byte[] generateStartKey(PartitionFilter filter) {
    if (null == filter) {
        return null;
    }
    // validate partition filter, convert values, and compute size of output
    Map<String, FieldType> partitionFields = partitioning.getFields();
    int totalSize = 0;
    ArrayList<byte[]> values = Lists.newArrayListWithCapacity(partitionFields.size());
    for (Map.Entry<String, FieldType> entry : partitionFields.entrySet()) {
        String fieldName = entry.getKey();
        FieldType fieldType = entry.getValue();
        PartitionFilter.Condition<? extends Comparable> condition = filter.getCondition(fieldName);
        if (condition == null) {
            // the filter has no condition for this field; we can't include any more fields in the start key
            break;
        }
        Comparable lowerValue = condition.getLower();
        if (lowerValue == null) {
            // this field has no lower bound; we can't include any more fields in the start key
            break;
        }
        try {
            fieldType.validate(lowerValue);
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException(String.format("Invalid partition filter: Lower bound for field '%s' is incompatible with the partitioning: %s", fieldName, e.getMessage()));
        }
        byte[] bytes = FieldTypes.toBytes(lowerValue, fieldType);
        totalSize += bytes.length;
        values.add(bytes);
    }
    if (values.isEmpty()) {
        return null;
    }
    // one \0 between each of the fields
    totalSize += values.size() - 1;
    byte[] startKey = new byte[totalSize];
    int offset = 0;
    for (byte[] bytes : values) {
        System.arraycopy(bytes, 0, startKey, offset, bytes.length);
        // this leaves a \0 byte after the value
        offset += bytes.length + 1;
    }
    return startKey;
}
Also used : PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) FieldType(co.cask.cdap.api.dataset.lib.Partitioning.FieldType)
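
To make the key layout concrete: the method concatenates the byte encodings of the lower bounds, in partitioning order, with a single \0 between consecutive fields, and stops at the first field that lacks a lower bound. A tiny self-contained illustration in plain Java (hypothetical string fields; not CDAP code, which encodes values via FieldTypes.toBytes):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class StartKeyLayoutDemo {
    public static void main(String[] args) {
        byte[] year = "2012".getBytes(StandardCharsets.UTF_8);
        byte[] month = "04".getBytes(StandardCharsets.UTF_8);
        // total size: value bytes plus one \0 separator between the two fields
        byte[] startKey = new byte[year.length + 1 + month.length];
        System.arraycopy(year, 0, startKey, 0, year.length);
        // startKey[year.length] remains 0, the separator left by the fresh array
        System.arraycopy(month, 0, startKey, year.length + 1, month.length);
        System.out.println(Arrays.toString(startKey)); // [50, 48, 49, 50, 0, 48, 52]
    }
}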

Example 15 with PartitionFilter

Use of co.cask.cdap.api.dataset.lib.PartitionFilter in project cdap by caskdata.

The class PartitionedFileSetDataset, method generateStopKey.

private byte[] generateStopKey(PartitionFilter filter) {
    if (null == filter) {
        return null;
    }
    // validate partition filter, convert values, and compute size of output
    Map<String, FieldType> partitionFields = partitioning.getFields();
    int totalSize = 0;
    boolean allSingleValue = true;
    ArrayList<byte[]> values = Lists.newArrayListWithCapacity(partitionFields.size());
    for (Map.Entry<String, FieldType> entry : partitionFields.entrySet()) {
        String fieldName = entry.getKey();
        FieldType fieldType = entry.getValue();
        PartitionFilter.Condition<? extends Comparable> condition = filter.getCondition(fieldName);
        if (condition == null) {
            // the filter has no condition for this field; we can't include any more fields in the stop key
            break;
        }
        Comparable upperValue = condition.getUpper();
        if (upperValue == null) {
            // this field has no upper bound; we can't include any more fields in the stop key
            break;
        }
        try {
            fieldType.validate(upperValue);
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException(String.format("Invalid partition filter: Upper bound for field '%s' is incompatible with the partitioning: %s", fieldName, e.getMessage()));
        }
        byte[] bytes = FieldTypes.toBytes(upperValue, fieldType);
        totalSize += bytes.length;
        values.add(bytes);
        if (!condition.isSingleValue()) {
            allSingleValue = false;
            // this condition has a genuine upper bound; fields after it cannot affect the stop key
            break;
        }
    }
    if (values.isEmpty()) {
        return null;
    }
    // one \0 between each of the fields
    totalSize += values.size() - 1;
    if (allSingleValue) {
        // in this case the start and stop keys are equal; append one \1 to ensure the scan is not empty
        totalSize++;
    }
    byte[] stopKey = new byte[totalSize];
    int offset = 0;
    for (byte[] bytes : values) {
        System.arraycopy(bytes, 0, stopKey, offset, bytes.length);
        // this leaves a \0 byte after the value
        offset += bytes.length + 1;
        if (allSingleValue && offset == stopKey.length) {
            // see above: write \1 instead of \0 at the end, to make sure the scan is not empty
            stopKey[offset - 1] = 1;
        }
    }
    return stopKey;
}
Also used : FieldType(co.cask.cdap.api.dataset.lib.Partitioning.FieldType) PartitionFilter(co.cask.cdap.api.dataset.lib.PartitionFilter) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap)
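
The trailing \1 deserves a concrete look. When every condition is a single value, the start and stop keys would be byte-identical, so the scan range [start, stop) would match nothing; appending \1 keeps the range non-empty while still covering the exact key and any stored key that extends it with a \0 separator and further fields. A minimal sketch in plain Java (Java 9+ for Arrays.compare; hypothetical key values):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class StopKeyDemo {
    public static void main(String[] args) {
        // start key: the single value itself
        byte[] start = "2012".getBytes(StandardCharsets.UTF_8);
        // a stored row key carrying one more partition field after the \0 separator
        byte[] extended = "2012\0asia".getBytes(StandardCharsets.UTF_8);
        // stop key: the value followed by \1 instead of a trailing \0
        byte[] stop = "2012\u0001".getBytes(StandardCharsets.UTF_8);
        // signed byte comparison is safe here because every byte is below 0x80
        System.out.println(Arrays.compare(start, extended) < 0); // true: inside the scan
        System.out.println(Arrays.compare(extended, stop) < 0);  // true: still before stop
    }
}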

Aggregations

PartitionFilter (co.cask.cdap.api.dataset.lib.PartitionFilter): 17 uses
Test (org.junit.Test): 12 uses
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 7 uses
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 6 uses
TransactionAware (org.apache.tephra.TransactionAware): 6 uses
TransactionExecutor (org.apache.tephra.TransactionExecutor): 5 uses
Partition (co.cask.cdap.api.dataset.lib.Partition): 4 uses
Predicate (co.cask.cdap.api.Predicate): 3 uses
DataSetException (co.cask.cdap.api.dataset.DataSetException): 3 uses
PartitionDetail (co.cask.cdap.api.dataset.lib.PartitionDetail): 3 uses
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 3 uses
IOException (java.io.IOException): 3 uses
HashMap (java.util.HashMap): 3 uses
PartitionNotFoundException (co.cask.cdap.api.dataset.PartitionNotFoundException): 2 uses
PartitionAlreadyExistsException (co.cask.cdap.api.dataset.lib.PartitionAlreadyExistsException): 2 uses
FieldType (co.cask.cdap.api.dataset.lib.Partitioning.FieldType): 2 uses
Row (co.cask.cdap.api.dataset.table.Row): 2 uses
Table (co.cask.cdap.api.dataset.table.Table): 2 uses
ApplicationWithPrograms (co.cask.cdap.internal.app.deploy.pipeline.ApplicationWithPrograms): 2 uses
BasicArguments (co.cask.cdap.internal.app.runtime.BasicArguments): 2 uses