Search in sources:

Example 1 with WindowDataManager

use of org.apache.apex.malhar.lib.wal.WindowDataManager in project apex-malhar by apache.

From class TestNiFiInputApplication, method populateDAG:

/**
 * Wires the test topology: a NiFi site-to-site input operator feeding a console sink.
 *
 * @param dag  the DAG under construction
 * @param conf the launch configuration (unused here)
 */
@Override
public void populateDAG(DAG dag, Configuration conf) {
    // Site-to-site configuration for the local NiFi instance; pull batches of 5 per request.
    final SiteToSiteClientConfig s2sConfig = new SiteToSiteClient.Builder()
        .url("http://localhost:8080/nifi")
        .portName("Apex")
        .requestBatchCount(5)
        .buildConfig();
    final SiteToSiteClient.Builder clientBuilder = new SiteToSiteClient.Builder().fromConfig(s2sConfig);

    // No-op manager: no idempotency/recovery state is persisted for this test.
    final WindowDataManager idempotencyManager = new WindowDataManager.NoopWindowDataManager();

    final NiFiSinglePortInputOperator nifiInput =
        dag.addOperator("nifi", new NiFiSinglePortInputOperator(clientBuilder, idempotencyManager));
    final ConsoleOutputOperator consoleSink = dag.addOperator("console", new ConsoleOutputOperator());

    // A null locality lets the engine decide where to deploy the stream endpoints.
    dag.addStream("nifi_console", nifiInput.outputPort, consoleSink.input).setLocality(null);
}
Also used : SiteToSiteClient(org.apache.nifi.remote.client.SiteToSiteClient) ConsoleOutputOperator(org.apache.apex.malhar.lib.io.ConsoleOutputOperator) SiteToSiteClientConfig(org.apache.nifi.remote.client.SiteToSiteClientConfig) NiFiSinglePortInputOperator(org.apache.apex.malhar.contrib.nifi.NiFiSinglePortInputOperator) WindowDataManager(org.apache.apex.malhar.lib.wal.WindowDataManager)

Example 2 with WindowDataManager

use of org.apache.apex.malhar.lib.wal.WindowDataManager in project apex-malhar by apache.

From class AbstractFileInputOperator, method definePartitions:

/**
 * Repartitions this file input operator. Collective scanning state (processed, failed,
 * pending and partially-read files plus the global retry/failure counters) is gathered
 * from every existing partition and redistributed across the new partitions: each file
 * goes to the one new partition whose {@code DirectoryScanner} accepts its path.
 *
 * @param partitions the existing partitions to be replaced
 * @param context    partitioning context supplying the capacity hint
 * @return the input collection unchanged when the computed count equals the current
 *         count, otherwise the freshly built partitions
 */
@Override
public Collection<Partition<AbstractFileInputOperator<T>>> definePartitions(Collection<Partition<AbstractFileInputOperator<T>>> partitions, PartitioningContext context) {
    lastRepartition = System.currentTimeMillis();
    int totalCount = getNewPartitionCount(partitions, context);
    LOG.debug("Computed new partitions: {}", totalCount);
    if (totalCount == partitions.size()) {
        return partitions;
    }
    AbstractFileInputOperator<T> tempOperator = partitions.iterator().next().getPartitionedInstance();
    MutableLong tempGlobalNumberOfRetries = tempOperator.globalNumberOfRetries;
    // BUGFIX: previously read globalNumberOfRetries here, silently dropping the failure count.
    MutableLong tempGlobalNumberOfFailures = tempOperator.globalNumberOfFailures;
    /*
     * Build collective state from all instances of the operator.
     */
    Set<String> totalProcessedFiles = Sets.newHashSet();
    Set<FailedFile> currentFiles = Sets.newHashSet();
    List<DirectoryScanner> oldscanners = Lists.newLinkedList();
    List<FailedFile> totalFailedFiles = Lists.newLinkedList();
    List<String> totalPendingFiles = Lists.newLinkedList();
    Set<Integer> deletedOperators = Sets.newHashSet();
    for (Partition<AbstractFileInputOperator<T>> partition : partitions) {
        AbstractFileInputOperator<T> oper = partition.getPartitionedInstance();
        totalProcessedFiles.addAll(oper.processedFiles);
        totalFailedFiles.addAll(oper.failedFiles);
        totalPendingFiles.addAll(oper.pendingFiles);
        // BUGFIX: previously read this.unfinishedFiles, so each iteration re-added the
        // current instance's own set instead of merging every partition's state.
        currentFiles.addAll(oper.unfinishedFiles);
        tempGlobalNumberOfRetries.add(oper.localNumberOfRetries);
        tempGlobalNumberOfFailures.add(oper.localNumberOfFailures);
        if (oper.currentFile != null) {
            // A file that was mid-read becomes an unfinished file, resumable at its offset.
            currentFiles.add(new FailedFile(oper.currentFile, oper.offset));
        }
        oldscanners.add(oper.getScanner());
        deletedOperators.add(oper.operatorId);
    }
    /*
     * Create partitions of scanners, scanner's partition method will do state
     * transfer for DirectoryScanner objects.
     */
    List<DirectoryScanner> scanners = scanner.partition(totalCount, oldscanners);
    Collection<Partition<AbstractFileInputOperator<T>>> newPartitions = Lists.newArrayListWithExpectedSize(totalCount);
    List<WindowDataManager> newManagers = windowDataManager.partition(totalCount, deletedOperators);
    KryoCloneUtils<AbstractFileInputOperator<T>> cloneUtils = KryoCloneUtils.createCloneUtils(this);
    for (int i = 0; i < scanners.size(); i++) {
        @SuppressWarnings("unchecked") AbstractFileInputOperator<T> oper = cloneUtils.getClone();
        DirectoryScanner scn = scanners.get(i);
        oper.setScanner(scn);
        // Do state transfer for processed files.
        oper.processedFiles.addAll(totalProcessedFiles);
        // BUGFIX: the failures/retries counters were previously assigned to the swapped
        // fields (retries temp into the failures field and vice versa).
        oper.globalNumberOfFailures = tempGlobalNumberOfFailures;
        oper.localNumberOfFailures.setValue(0);
        oper.globalNumberOfRetries = tempGlobalNumberOfRetries;
        oper.localNumberOfRetries.setValue(0);
        /* redistribute unfinished files properly */
        oper.unfinishedFiles.clear();
        oper.currentFile = null;
        oper.offset = 0;
        Iterator<FailedFile> unfinishedIter = currentFiles.iterator();
        while (unfinishedIter.hasNext()) {
            FailedFile unfinishedFile = unfinishedIter.next();
            if (scn.acceptFile(unfinishedFile.path)) {
                oper.unfinishedFiles.add(unfinishedFile);
                unfinishedIter.remove();
            }
        }
        /* transfer failed files */
        oper.failedFiles.clear();
        Iterator<FailedFile> iter = totalFailedFiles.iterator();
        while (iter.hasNext()) {
            FailedFile ff = iter.next();
            if (scn.acceptFile(ff.path)) {
                oper.failedFiles.add(ff);
                iter.remove();
            }
        }
        /* redistribute pending files properly */
        oper.pendingFiles.clear();
        Iterator<String> pendingFilesIterator = totalPendingFiles.iterator();
        while (pendingFilesIterator.hasNext()) {
            String pathString = pendingFilesIterator.next();
            if (scn.acceptFile(pathString)) {
                oper.pendingFiles.add(pathString);
                pendingFilesIterator.remove();
            }
        }
        oper.setWindowDataManager(newManagers.get(i));
        newPartitions.add(new DefaultPartition<AbstractFileInputOperator<T>>(oper));
    }
    LOG.info("definePartitions called returning {} partitions", newPartitions.size());
    return newPartitions;
}
Also used : DefaultPartition(com.datatorrent.api.DefaultPartition) MutableLong(org.apache.commons.lang.mutable.MutableLong) WindowDataManager(org.apache.apex.malhar.lib.wal.WindowDataManager)

Example 3 with WindowDataManager

use of org.apache.apex.malhar.lib.wal.WindowDataManager in project apex-malhar by apache.

From class AbstractKafkaInputOperator, method definePartitions:

/**
 * Defines the Kafka input operator partitions according to the configured {@code strategy}.
 * <p>
 * ONE_TO_ONE creates one operator partition per Kafka partition (and later adds operator
 * partitions for newly discovered Kafka partitions); ONE_TO_MANY packs the Kafka partitions
 * round-robin into at most {@code initialPartitionCount} operator partitions. Whenever the
 * partition set changes, the window data managers are re-partitioned so recovery state
 * follows the new layout.
 *
 * @param partitions the existing operator partitions
 * @param context    the partitioning context
 * @return the resulting partitions; the unchanged input collection when nothing changed
 */
@Override
public Collection<Partitioner.Partition<AbstractKafkaInputOperator<K>>> definePartitions(Collection<Partitioner.Partition<AbstractKafkaInputOperator<K>>> partitions, Partitioner.PartitioningContext context) {
    // Initialize brokers from zookeepers
    getConsumer().initBrokers();
    boolean isInitialParitition = true;
    // A partition with no stats yet means this is the very first partitioning call.
    if (partitions.iterator().hasNext()) {
        isInitialParitition = partitions.iterator().next().getStats() == null;
    }
    // Operator partitions built by this call (stays null when nothing changes)
    List<Partitioner.Partition<AbstractKafkaInputOperator<K>>> newPartitions = null;
    // Offsets to seed the initial partitions with, keyed by Kafka partition
    Map<KafkaPartition, Long> initOffset = null;
    if (isInitialParitition && offsetManager != null) {
        initOffset = offsetManager.loadInitialOffsets();
        logger.info("Initial offsets: {} ", "{ " + Joiner.on(", ").useForNull("").withKeyValueSeparator(": ").join(initOffset) + " }");
    }
    Set<Integer> deletedOperators = Sets.newHashSet();
    Collection<Partition<AbstractKafkaInputOperator<K>>> resultPartitions = partitions;
    boolean numPartitionsChanged = false;
    switch(strategy) {
        // Each operator partition will consume from exactly one Kafka partition.
        case ONE_TO_ONE:
            if (isInitialParitition) {
                lastRepartitionTime = System.currentTimeMillis();
                logger.info("[ONE_TO_ONE]: Initializing partition(s)");
                // Get the partition metadata for the topic. Regardless of whether the operator
                // uses the high-level or the simple Kafka consumer, a temporary simple consumer
                // is always created to fetch the topic metadata; the consumer's initial
                // brokerList value is used for the metadata lookup.
                Map<String, List<PartitionMetadata>> kafkaPartitions = KafkaMetadataUtil.getPartitionsForTopic(getConsumer().brokers, getConsumer().getTopic());
                // Create one operator partition per Kafka partition (across all clusters).
                newPartitions = new LinkedList<Partitioner.Partition<AbstractKafkaInputOperator<K>>>();
                for (Map.Entry<String, List<PartitionMetadata>> kp : kafkaPartitions.entrySet()) {
                    String clusterId = kp.getKey();
                    for (PartitionMetadata pm : kp.getValue()) {
                        logger.info("[ONE_TO_ONE]: Create operator partition for cluster {}, topic {}, kafka partition {} ", clusterId, getConsumer().topic, pm.partitionId());
                        newPartitions.add(createPartition(Sets.newHashSet(new KafkaPartition(clusterId, consumer.topic, pm.partitionId())), initOffset));
                    }
                }
                resultPartitions = newPartitions;
                numPartitionsChanged = true;
            } else if (newWaitingPartition.size() != 0) {
                // Add an operator partition for each newly discovered Kafka partition.
                for (KafkaPartition newPartition : newWaitingPartition) {
                    logger.info("[ONE_TO_ONE]: Add operator partition for cluster {}, topic {}, partition {}", newPartition.getClusterId(), getConsumer().topic, newPartition.getPartitionId());
                    partitions.add(createPartition(Sets.newHashSet(newPartition), null));
                }
                newWaitingPartition.clear();
                resultPartitions = partitions;
                numPartitionsChanged = true;
            }
            break;
        // Pack several Kafka partitions per operator partition, keeping the total intake
        // rate for each operator partition below some threshold.
        case ONE_TO_MANY:
            if (getConsumer() instanceof HighlevelKafkaConsumer) {
                throw new UnsupportedOperationException("[ONE_TO_MANY]: The high-level consumer is not supported for ONE_TO_MANY partition strategy.");
            }
            if (isInitialParitition || newWaitingPartition.size() != 0) {
                lastRepartitionTime = System.currentTimeMillis();
                logger.info("[ONE_TO_MANY]: Initializing partition(s)");
                // Get the partition metadata for the topic. Regardless of whether the operator
                // uses the high-level or the simple Kafka consumer, a temporary simple consumer
                // is always created to fetch the topic metadata; the consumer's initial
                // brokerList value is used for the metadata lookup.
                Map<String, List<PartitionMetadata>> kafkaPartitions = KafkaMetadataUtil.getPartitionsForTopic(getConsumer().brokers, getConsumer().getTopic());
                int size = initialPartitionCount;
                @SuppressWarnings("unchecked") Set<KafkaPartition>[] kps = (Set<KafkaPartition>[]) Array.newInstance((new HashSet<KafkaPartition>()).getClass(), size);
                // Distribute the Kafka partitions round-robin over the operator partitions.
                int i = 0;
                for (Map.Entry<String, List<PartitionMetadata>> en : kafkaPartitions.entrySet()) {
                    String clusterId = en.getKey();
                    for (PartitionMetadata pm : en.getValue()) {
                        if (kps[i % size] == null) {
                            kps[i % size] = new HashSet<KafkaPartition>();
                        }
                        kps[i % size].add(new KafkaPartition(clusterId, consumer.topic, pm.partitionId()));
                        i++;
                    }
                }
                // Never create more operator partitions than there are Kafka partitions.
                size = i > size ? size : i;
                newPartitions = new ArrayList<Partitioner.Partition<AbstractKafkaInputOperator<K>>>(size);
                for (i = 0; i < size; i++) {
                    logger.info("[ONE_TO_MANY]: Create operator partition for kafka partition(s): {} ", StringUtils.join(kps[i], ", "));
                    newPartitions.add(createPartition(kps[i], initOffset));
                }
                // All existing partitions are replaced, so mark their operator ids deleted.
                for (Partition<AbstractKafkaInputOperator<K>> op : partitions) {
                    deletedOperators.add(op.getPartitionedInstance().operatorId);
                }
                newWaitingPartition.clear();
                resultPartitions = newPartitions;
                numPartitionsChanged = true;
            }
            break;
        case ONE_TO_MANY_HEURISTIC:
            throw new UnsupportedOperationException("[ONE_TO_MANY_HEURISTIC]: Not implemented yet");
        default:
            break;
    }
    // Re-partition the window data managers to match the new operator partition layout.
    if (numPartitionsChanged) {
        List<WindowDataManager> managers = windowDataManager.partition(resultPartitions.size(), deletedOperators);
        int i = 0;
        for (Partition<AbstractKafkaInputOperator<K>> partition : resultPartitions) {
            partition.getPartitionedInstance().setWindowDataManager(managers.get(i++));
        }
    }
    return resultPartitions;
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) HashSet(java.util.HashSet) DefaultPartition(com.datatorrent.api.DefaultPartition) WindowDataManager(org.apache.apex.malhar.lib.wal.WindowDataManager) PartitionMetadata(kafka.javaapi.PartitionMetadata) HashMap(java.util.HashMap) Map(java.util.Map)

Example 4 with WindowDataManager

use of org.apache.apex.malhar.lib.wal.WindowDataManager in project apex-malhar by apache.

From class TestNiFiOutputApplication, method populateDAG:

/**
 * Wires the test topology: a random event generator feeding a NiFi site-to-site
 * output operator.
 *
 * @param dag  the DAG under construction
 * @param conf the launch configuration (unused here)
 */
@Override
public void populateDAG(DAG dag, Configuration conf) {
    // Site-to-site configuration for the local NiFi instance.
    final SiteToSiteClientConfig s2sConfig = new SiteToSiteClient.Builder()
        .url("http://localhost:8080/nifi")
        .portName("Apex")
        .buildConfig();
    final SiteToSiteClient.Builder clientBuilder = new SiteToSiteClient.Builder().fromConfig(s2sConfig);

    // Converts each String tuple into a NiFi data packet; flush after every tuple.
    final NiFiDataPacketBuilder<String> packetBuilder = new StringNiFiDataPacketBuilder();
    final int flushBatchSize = 1;

    // No-op manager: no idempotency/recovery state is persisted for this test.
    final WindowDataManager idempotencyManager = new WindowDataManager.NoopWindowDataManager();

    final RandomEventGenerator randSource = dag.addOperator("rand", new RandomEventGenerator());
    final NiFiSinglePortOutputOperator nifiSink = dag.addOperator("nifi",
        new NiFiSinglePortOutputOperator(clientBuilder, packetBuilder, idempotencyManager, flushBatchSize));

    // A null locality lets the engine decide where to deploy the stream endpoints.
    dag.addStream("rand_nifi", randSource.string_data, nifiSink.inputPort).setLocality(null);
}
Also used : SiteToSiteClient(org.apache.nifi.remote.client.SiteToSiteClient) NiFiSinglePortOutputOperator(org.apache.apex.malhar.contrib.nifi.NiFiSinglePortOutputOperator) SiteToSiteClientConfig(org.apache.nifi.remote.client.SiteToSiteClientConfig) WindowDataManager(org.apache.apex.malhar.lib.wal.WindowDataManager) RandomEventGenerator(org.apache.apex.malhar.lib.testbench.RandomEventGenerator)

Example 5 with WindowDataManager

use of org.apache.apex.malhar.lib.wal.WindowDataManager in project apex-malhar by apache.

From class AbstractKinesisInputOperator, method definePartitions:

/**
 * Defines the Kinesis input operator partitions according to the configured {@code strategy}.
 * <p>
 * ONE_TO_ONE creates one operator partition per Kinesis shard (and later replaces/adds
 * partitions as shards close or appear); MANY_TO_ONE packs the shards round-robin into a
 * fixed number of operator partitions derived from {@code initialPartitionCount} and
 * {@code shardsPerPartition}.
 *
 * @param partitions the existing operator partitions (assumed non-empty by the framework)
 * @param context    the partitioning context
 * @return the newly built partitions; the input collection only on the ONE_TO_ONE
 *         shard-change path
 */
@Override
public Collection<Partition<AbstractKinesisInputOperator>> definePartitions(Collection<Partition<AbstractKinesisInputOperator>> partitions, PartitioningContext context) {
    // A partition with no stats yet means this is the very first partitioning call.
    boolean isInitialParitition = partitions.iterator().next().getStats() == null;
    // Set the credentials to get the list of shards
    if (isInitialParitition) {
        try {
            KinesisUtil.getInstance().createKinesisClient(accessKey, secretKey, endPoint);
        } catch (Exception e) {
            throw new RuntimeException("[definePartitions]: Unable to load credentials. ", e);
        }
    }
    List<Shard> shards = KinesisUtil.getInstance().getShardList(getStreamName());
    // Operator partitions built by this call
    List<Partition<AbstractKinesisInputOperator>> newPartitions = null;
    Set<Integer> deletedOperators = Sets.newHashSet();
    // Shard positions to seed the initial partitions with, keyed by shard id
    Map<String, String> initShardPos = null;
    if (isInitialParitition && shardManager != null) {
        initShardPos = shardManager.loadInitialShardPositions();
    }
    switch(strategy) {
        // Each operator partition will consume from exactly one Kinesis shard.
        case ONE_TO_ONE:
            if (isInitialParitition) {
                lastRepartitionTime = System.currentTimeMillis();
                logger.info("[ONE_TO_ONE]: Initializing partition(s)");
                // Create one operator partition per shard.
                newPartitions = new ArrayList<Partition<AbstractKinesisInputOperator>>(shards.size());
                for (int i = 0; i < shards.size(); i++) {
                    logger.info("[ONE_TO_ONE]: Create operator partition for kinesis partition: " + shards.get(i).getShardId() + ", StreamName: " + this.getConsumer().streamName);
                    newPartitions.add(createPartition(Sets.newHashSet(shards.get(i).getShardId()), initShardPos));
                }
            } else if (newWaitingPartition.size() != 0) {
                // Remove the partitions for the closed shards
                removePartitionsForClosedShards(partitions, deletedOperators);
                // add partition for new kinesis shard
                for (String pid : newWaitingPartition) {
                    logger.info("[ONE_TO_ONE]: Add operator partition for kinesis partition " + pid);
                    partitions.add(createPartition(Sets.newHashSet(pid), null));
                }
                newWaitingPartition.clear();
                List<WindowDataManager> managers = windowDataManager.partition(partitions.size(), deletedOperators);
                int i = 0;
                for (Partition<AbstractKinesisInputOperator> partition : partitions) {
                    partition.getPartitionedInstance().setWindowDataManager(managers.get(i));
                    i++;
                }
                // Early return: this is the only path that returns the (mutated) input collection.
                return partitions;
            }
            // NOTE(review): when not initial and no waiting shards, newPartitions stays null
            // and the method falls through to return null — confirm callers tolerate this.
            break;
        // Afterwards, the framework will dynamically adjust the partition
        case MANY_TO_ONE:
            /* This case was handled into two ways.
         1. Dynamic Partition: Number of DT partitions is depends on the number of open shards.
         2. Static Partition: Number of DT partitions is fixed, whether the number of shards are increased/decreased.
      */
            int size = initialPartitionCount;
            if (newWaitingPartition.size() != 0) {
                // Get the list of open shards
                shards = getOpenShards(partitions);
                if (shardsPerPartition > 1) {
                    size = (int) Math.ceil(shards.size() / (shardsPerPartition * 1.0));
                }
                // NOTE(review): shardManager is assumed non-null here, unlike the
                // null-checked read above — confirm this path cannot NPE.
                initShardPos = shardManager.loadInitialShardPositions();
            }
            // Distribute the shard ids round-robin over `size` operator partitions.
            @SuppressWarnings("unchecked") Set<String>[] pIds = (Set<String>[]) Array.newInstance((new HashSet<String>()).getClass(), size);
            newPartitions = new ArrayList<Partition<AbstractKinesisInputOperator>>(size);
            for (int i = 0; i < shards.size(); i++) {
                Shard pm = shards.get(i);
                if (pIds[i % size] == null) {
                    pIds[i % size] = new HashSet<String>();
                }
                pIds[i % size].add(pm.getShardId());
            }
            if (isInitialParitition) {
                lastRepartitionTime = System.currentTimeMillis();
                logger.info("[MANY_TO_ONE]: Initializing partition(s)");
            } else {
                logger.info("[MANY_TO_ONE]: Add operator partition for kinesis partition(s): " + StringUtils.join(newWaitingPartition, ", ") + ", StreamName: " + this.getConsumer().streamName);
                newWaitingPartition.clear();
            }
            // All existing partitions are replaced, so mark their operator ids deleted.
            for (Partition<AbstractKinesisInputOperator> op : partitions) {
                deletedOperators.add(op.getPartitionedInstance().operatorId);
            }
            for (int i = 0; i < pIds.length; i++) {
                logger.info("[MANY_TO_ONE]: Create operator partition for kinesis partition(s): " + StringUtils.join(pIds[i], ", ") + ", StreamName: " + this.getConsumer().streamName);
                if (pIds[i] != null) {
                    newPartitions.add(createPartition(pIds[i], initShardPos));
                }
            }
            break;
        default:
            break;
    }
    // NOTE(review): the managers are sized from the OLD partitions collection and assigned
    // to the OLD partition instances, yet newPartitions is what gets returned — looks
    // inconsistent with the Kafka variant above; verify against upstream before changing.
    int i = 0;
    List<WindowDataManager> managers = windowDataManager.partition(partitions.size(), deletedOperators);
    for (Partition<AbstractKinesisInputOperator> partition : partitions) {
        partition.getPartitionedInstance().setWindowDataManager(managers.get(i++));
    }
    return newPartitions;
}
Also used : DefaultPartition(com.datatorrent.api.DefaultPartition) HashSet(java.util.HashSet) Set(java.util.Set) IOException(java.io.IOException) WindowDataManager(org.apache.apex.malhar.lib.wal.WindowDataManager) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) Shard(com.amazonaws.services.kinesis.model.Shard) HashSet(java.util.HashSet)

Aggregations

WindowDataManager (org.apache.apex.malhar.lib.wal.WindowDataManager)5 DefaultPartition (com.datatorrent.api.DefaultPartition)3 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 LinkedList (java.util.LinkedList)2 List (java.util.List)2 Set (java.util.Set)2 SiteToSiteClient (org.apache.nifi.remote.client.SiteToSiteClient)2 SiteToSiteClientConfig (org.apache.nifi.remote.client.SiteToSiteClientConfig)2 Shard (com.amazonaws.services.kinesis.model.Shard)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 PartitionMetadata (kafka.javaapi.PartitionMetadata)1 NiFiSinglePortInputOperator (org.apache.apex.malhar.contrib.nifi.NiFiSinglePortInputOperator)1 NiFiSinglePortOutputOperator (org.apache.apex.malhar.contrib.nifi.NiFiSinglePortOutputOperator)1 ConsoleOutputOperator (org.apache.apex.malhar.lib.io.ConsoleOutputOperator)1 RandomEventGenerator (org.apache.apex.malhar.lib.testbench.RandomEventGenerator)1 MutableLong (org.apache.commons.lang.mutable.MutableLong)1