Example 16 with Firehose

Use of io.druid.data.input.Firehose in project druid by druid-io.

The class RabbitMQFirehoseFactory, method connect.

@Override
public Firehose connect(final ByteBufferInputRowParser firehoseParser) throws IOException {
    ConnectionOptions lyraOptions = new ConnectionOptions(this.connectionFactory);
    Config lyraConfig = new Config().withRecoveryPolicy(
        new RetryPolicy()
            .withMaxRetries(config.getMaxRetries())
            .withRetryInterval(Duration.seconds(config.getRetryIntervalSeconds()))
            .withMaxDuration(Duration.seconds(config.getMaxDurationSeconds()))
    );
    String queue = config.getQueue();
    String exchange = config.getExchange();
    String routingKey = config.getRoutingKey();
    boolean durable = config.isDurable();
    boolean exclusive = config.isExclusive();
    boolean autoDelete = config.isAutoDelete();
    final Connection connection = Connections.create(lyraOptions, lyraConfig);
    connection.addShutdownListener(new ShutdownListener() {

        @Override
        public void shutdownCompleted(ShutdownSignalException cause) {
            log.warn(cause, "Connection closed!");
        }
    });
    final Channel channel = connection.createChannel();
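    // Declare the queue and bind it to the exchange with the configured routing key.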
    channel.queueDeclare(queue, durable, exclusive, autoDelete, null);
    channel.queueBind(queue, exchange, routingKey);
    channel.addShutdownListener(new ShutdownListener() {

        @Override
        public void shutdownCompleted(ShutdownSignalException cause) {
            log.warn(cause, "Channel closed!");
        }
    });
    // We create a QueueingConsumer that will not auto-acknowledge messages since that
    // happens on commit().
    final QueueingConsumer consumer = new QueueingConsumer(channel);
    channel.basicConsume(queue, false, consumer);
    return new Firehose() {

        /**
         * Storing the latest delivery as a member variable should be safe since this will only be run
         * by a single thread.
         */
        private Delivery delivery;

        /**
         * Store the latest delivery tag to be able to commit (acknowledge) the message delivery up to
         * and including this tag. See commit() for more detail.
         */
        private long lastDeliveryTag;

        @Override
        public boolean hasMore() {
            delivery = null;
            try {
                // Wait for the next delivery. This will block until something is available.
                delivery = consumer.nextDelivery();
                if (delivery != null) {
                    lastDeliveryTag = delivery.getEnvelope().getDeliveryTag();
                    // If delivery is non-null, we report that there is something more to process.
                    return true;
                }
            } catch (InterruptedException e) {
                // A little unclear on how we should handle this.
                // At any rate, we're in an unknown state now so let's log something and return false.
                log.wtf(e, "Got interrupted while waiting for next delivery. Doubt this should ever happen.");
            }
            // nothing more to process.
            return false;
        }

        @Override
        public InputRow nextRow() {
            if (delivery == null) {
                // Just making sure.
                log.wtf("I have nothing in delivery. Method hasMore() should have returned false.");
                return null;
            }
            return firehoseParser.parse(ByteBuffer.wrap(delivery.getBody()));
        }

        @Override
        public Runnable commit() {
            // Acknowledge all deliveries up to and including lastDeliveryTag.
            return new Runnable() {

                // Copy the last delivery tag now so the Runnable acknowledges a fixed point.
                final long deliveryTag = lastDeliveryTag;

                @Override
                public void run() {
                    try {
                        log.info("Acknowledging delivery of messages up to tag: " + deliveryTag);
                        // Acknowledge all messages up to and including the stored delivery tag.
                        channel.basicAck(deliveryTag, true);
                    } catch (IOException e) {
                        log.error(e, "Unable to acknowledge message reception to message queue.");
                    }
                }
            };
        }

        @Override
        public void close() throws IOException {
            log.info("Closing connection to RabbitMQ");
            channel.close();
            connection.close();
        }
    };
}
Also used : Config(net.jodah.lyra.config.Config) Firehose(io.druid.data.input.Firehose) Channel(com.rabbitmq.client.Channel) Connection(com.rabbitmq.client.Connection) IOException(java.io.IOException) ShutdownListener(com.rabbitmq.client.ShutdownListener) ShutdownSignalException(com.rabbitmq.client.ShutdownSignalException) ConnectionOptions(net.jodah.lyra.ConnectionOptions) Delivery(com.rabbitmq.client.QueueingConsumer.Delivery) RetryPolicy(net.jodah.lyra.retry.RetryPolicy)
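
The contract the caller relies on is visible above: hasMore() blocks on consumer.nextDelivery(), nextRow() parses the stored delivery, and the Runnable returned by commit() acknowledges everything up to the captured tag with a single basicAck(deliveryTag, true). Below is a minimal driver sketch of that cycle, assuming the io.druid artifacts are on the classpath; the RabbitFirehoseDriver name and the commitEvery batching parameter are illustrative, not part of Druid.

import io.druid.data.input.Firehose;
import io.druid.data.input.InputRow;

import java.io.IOException;

public class RabbitFirehoseDriver {

    // Drain a connected firehose, acknowledging in batches of commitEvery rows.
    public static void drain(Firehose firehose, int commitEvery) throws IOException {
        int sinceCommit = 0;
        try {
            while (firehose.hasMore()) {
                // nextRow() parses the body of the delivery fetched by hasMore().
                InputRow row = firehose.nextRow();
                // ... hand the row to the indexing task ...
                if (++sinceCommit >= commitEvery) {
                    // Runs basicAck(lastDeliveryTag, true) on the consumer's channel.
                    firehose.commit().run();
                    sinceCommit = 0;
                }
            }
            // Acknowledge whatever remains before shutting down.
            firehose.commit().run();
        } finally {
            // Closes the channel and the Lyra-managed connection.
            firehose.close();
        }
    }
}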

Example 17 with Firehose

Use of io.druid.data.input.Firehose in project druid by druid-io.

The class BatchDeltaIngestionTest, method testIngestion.

private void testIngestion(HadoopDruidIndexerConfig config, List<ImmutableMap<String, Object>> expectedRowsGenerated, WindowedDataSegment windowedDataSegment) throws Exception {
    IndexGeneratorJob job = new IndexGeneratorJob(config);
    JobHelper.runJobs(ImmutableList.<Jobby>of(job), config);
    File segmentFolder = new File(
        String.format(
            "%s/%s/%s_%s/%s/0",
            config.getSchema().getIOConfig().getSegmentOutputPath(),
            config.getSchema().getDataSchema().getDataSource(),
            INTERVAL_FULL.getStart().toString(),
            INTERVAL_FULL.getEnd().toString(),
            config.getSchema().getTuningConfig().getVersion()
        )
    );
    Assert.assertTrue(segmentFolder.exists());
    File descriptor = new File(segmentFolder, "descriptor.json");
    File indexZip = new File(segmentFolder, "index.zip");
    Assert.assertTrue(descriptor.exists());
    Assert.assertTrue(indexZip.exists());
    DataSegment dataSegment = MAPPER.readValue(descriptor, DataSegment.class);
    Assert.assertEquals("website", dataSegment.getDataSource());
    Assert.assertEquals(config.getSchema().getTuningConfig().getVersion(), dataSegment.getVersion());
    Assert.assertEquals(INTERVAL_FULL, dataSegment.getInterval());
    Assert.assertEquals("local", dataSegment.getLoadSpec().get("type"));
    Assert.assertEquals(indexZip.getCanonicalPath(), dataSegment.getLoadSpec().get("path"));
    Assert.assertEquals("host", dataSegment.getDimensions().get(0));
    Assert.assertEquals("visited_sum", dataSegment.getMetrics().get(0));
    Assert.assertEquals("unique_hosts", dataSegment.getMetrics().get(1));
    Assert.assertEquals(Integer.valueOf(9), dataSegment.getBinaryVersion());
    HashBasedNumberedShardSpec spec = (HashBasedNumberedShardSpec) dataSegment.getShardSpec();
    Assert.assertEquals(0, spec.getPartitionNum());
    Assert.assertEquals(1, spec.getPartitions());
    File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
    new LocalDataSegmentPuller().getSegmentFiles(dataSegment, tmpUnzippedSegmentDir);
    QueryableIndex index = INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
    StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
    Firehose firehose = new IngestSegmentFirehose(
        ImmutableList.of(new WindowedStorageAdapter(adapter, windowedDataSegment.getInterval())),
        ImmutableList.of("host"),
        ImmutableList.of("visited_sum", "unique_hosts"),
        null,
        Granularities.NONE
    );
    List<InputRow> rows = Lists.newArrayList();
    while (firehose.hasMore()) {
        rows.add(firehose.nextRow());
    }
    verifyRows(expectedRowsGenerated, rows);
}
Also used : HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) IngestSegmentFirehose(io.druid.segment.realtime.firehose.IngestSegmentFirehose) Firehose(io.druid.data.input.Firehose) IngestSegmentFirehose(io.druid.segment.realtime.firehose.IngestSegmentFirehose) QueryableIndexStorageAdapter(io.druid.segment.QueryableIndexStorageAdapter) StorageAdapter(io.druid.segment.StorageAdapter) WindowedStorageAdapter(io.druid.segment.realtime.firehose.WindowedStorageAdapter) QueryableIndexStorageAdapter(io.druid.segment.QueryableIndexStorageAdapter) DataSegment(io.druid.timeline.DataSegment) WindowedDataSegment(io.druid.indexer.hadoop.WindowedDataSegment) LocalDataSegmentPuller(io.druid.segment.loading.LocalDataSegmentPuller) QueryableIndex(io.druid.segment.QueryableIndex) InputRow(io.druid.data.input.InputRow) File(java.io.File) WindowedStorageAdapter(io.druid.segment.realtime.firehose.WindowedStorageAdapter)
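
The verifyRows(...) helper is not shown on this page. Below is a hedged sketch of what such a check might do, assuming each expected map carries a timestamp under "time" plus the "host" dimension and "visited_sum" metric read back above; the class name, method name, and map keys are assumptions for illustration, not the test's actual code.

import com.google.common.collect.ImmutableMap;
import io.druid.data.input.InputRow;
import org.junit.Assert;

import java.util.List;

class RowVerificationSketch {

    // Hypothetical stand-in for BatchDeltaIngestionTest's verifyRows(...) helper.
    static void verifyRowsSketch(List<ImmutableMap<String, Object>> expectedRows, List<InputRow> actualRows) {
        Assert.assertEquals(expectedRows.size(), actualRows.size());
        for (int i = 0; i < expectedRows.size(); i++) {
            ImmutableMap<String, Object> expected = expectedRows.get(i);
            InputRow actual = actualRows.get(i);
            // "time" is an assumed key holding an org.joda.time.DateTime.
            Assert.assertEquals(expected.get("time"), actual.getTimestamp());
            // getDimension(...) returns the dimension's values as a List<String>.
            Assert.assertEquals(expected.get("host"), actual.getDimension("host"));
            Assert.assertEquals(((Number) expected.get("visited_sum")).longValue(), actual.getLongMetric("visited_sum"));
        }
    }
}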

Example 18 with Firehose

Use of io.druid.data.input.Firehose in project druid by druid-io.

The class TwitterSpritzerFirehoseFactory, method connect.

@Override
public Firehose connect(InputRowParser parser) throws IOException {
    final ConnectionLifeCycleListener connectionLifeCycleListener = new ConnectionLifeCycleListener() {

        @Override
        public void onConnect() {
            log.info("Connected_to_Twitter");
        }

        @Override
        public void onDisconnect() {
            log.info("Disconnect_from_Twitter");
        }

        /**
         * Called before the thread gets cleaned up.
         */
        @Override
        public void onCleanUp() {
            log.info("Cleanup_twitter_stream");
        }
    }; // ConnectionLifeCycleListener
    final TwitterStream twitterStream;
    final StatusListener statusListener;
    final int QUEUE_SIZE = 2000;
    /** This queue moves twitter events from the twitter4j thread to the druid ingest thread. */
    final BlockingQueue<Status> queue = new ArrayBlockingQueue<Status>(QUEUE_SIZE);
    final long startMsec = System.currentTimeMillis();
    //
    //   set up Twitter Spritzer
    //
    twitterStream = new TwitterStreamFactory().getInstance();
    twitterStream.addConnectionLifeCycleListener(connectionLifeCycleListener);
    statusListener = new StatusListener() {

        // This is what really gets called to deliver stuff from twitter4j
        @Override
        public void onStatus(Status status) {
            // time to stop?
            if (Thread.currentThread().isInterrupted()) {
                throw new RuntimeException("Interrupted, time to stop");
            }
            try {
                boolean success = queue.offer(status, 15L, TimeUnit.SECONDS);
                if (!success) {
                    log.warn("queue too slow!");
                }
            } catch (InterruptedException e) {
                throw new RuntimeException("InterruptedException", e);
            }
        }

        @Override
        public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
            // log.info("Got a status deletion notice id:" + statusDeletionNotice.getStatusId());
        }

        @Override
        public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
            // This notice will be sent each time a limited stream becomes unlimited.
            // If this number is high and/or rapidly increasing, the predicate is too broad;
            // consider a predicate with higher selectivity.
            log.warn("Got track limitation notice:" + numberOfLimitedStatuses);
        }

        @Override
        public void onScrubGeo(long userId, long upToStatusId) {
            // log.info("Got scrub_geo event userId:" + userId + " upToStatusId:" + upToStatusId);
        }

        @Override
        public void onException(Exception ex) {
            log.error(ex, "Got exception from twitter stream");
        }

        @Override
        public void onStallWarning(StallWarning warning) {
            log.warn("Got stall warning: %s", warning);
        }
    };
    twitterStream.addListener(statusListener);
    // creates a generic StatusStream
    twitterStream.sample();
    log.info("returned from sample()");
    return new Firehose() {

        private final Runnable doNothingRunnable = new Runnable() {

            public void run() {
            }
        };

        private long rowCount = 0L;

        private boolean waitIfmax = (getMaxEventCount() < 0L);

        private final Map<String, Object> theMap = new TreeMap<>();

        // DIY json parsing // private final ObjectMapper omapper = new ObjectMapper();
        private boolean maxTimeReached() {
            if (getMaxRunMinutes() <= 0) {
                return false;
            } else {
                return (System.currentTimeMillis() - startMsec) / 60000L >= getMaxRunMinutes();
            }
        }

        private boolean maxCountReached() {
            return getMaxEventCount() >= 0 && rowCount >= getMaxEventCount();
        }

        @Override
        public boolean hasMore() {
            if (maxCountReached() || maxTimeReached()) {
                return waitIfmax;
            } else {
                return true;
            }
        }

        @Override
        public InputRow nextRow() {
            // Interrupted to stop?
            if (Thread.currentThread().isInterrupted()) {
                throw new RuntimeException("Interrupted, time to stop");
            }
            // all done?
            if (maxCountReached() || maxTimeReached()) {
                if (waitIfmax) {
                    // sleep a long time instead of terminating
                    try {
                        log.info("reached limit, sleeping a long time...");
                        Thread.sleep(2000000000L);
                    } catch (InterruptedException e) {
                        throw new RuntimeException("InterruptedException", e);
                    }
                } else {
                    // Allow this event through; the next hasMore() call will return false.
                }
            }
            if (++rowCount % 1000 == 0) {
                log.info("nextRow() has returned %,d InputRows", rowCount);
            }
            Status status;
            try {
                status = queue.take();
            } catch (InterruptedException e) {
                throw new RuntimeException("InterruptedException", e);
            }
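            // Reuse one TreeMap across rows; clear it before flattening this status.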
            theMap.clear();
            HashtagEntity[] hts = status.getHashtagEntities();
            String text = status.getText();
            theMap.put("text", (null == text) ? "" : text);
            theMap.put("htags", (hts.length > 0) ? Lists.transform(Arrays.asList(hts), new Function<HashtagEntity, String>() {

                @Nullable
                @Override
                public String apply(HashtagEntity input) {
                    return input.getText();
                }
            }) : ImmutableList.<String>of());
            // getContributors() may return null when the status has no contributors.
            long[] contributorIds = status.getContributors();
            List<String> contributors = new ArrayList<>();
            if (contributorIds != null) {
                for (long contrib : contributorIds) {
                    contributors.add(String.format("%d", contrib));
                }
            }
            theMap.put("contributors", contributors);
            GeoLocation geoLocation = status.getGeoLocation();
            if (null != geoLocation) {
                double lat = status.getGeoLocation().getLatitude();
                double lon = status.getGeoLocation().getLongitude();
                theMap.put("lat", lat);
                theMap.put("lon", lon);
            } else {
                theMap.put("lat", null);
                theMap.put("lon", null);
            }
            if (status.getSource() != null) {
                Matcher m = sourcePattern.matcher(status.getSource());
                theMap.put("source", m.find() ? m.group(1) : status.getSource());
            }
            theMap.put("retweet", status.isRetweet());
            if (status.isRetweet()) {
                Status original = status.getRetweetedStatus();
                theMap.put("retweet_count", original.getRetweetCount());
                User originator = original.getUser();
                theMap.put("originator_screen_name", originator != null ? originator.getScreenName() : "");
                theMap.put("originator_follower_count", originator != null ? originator.getFollowersCount() : "");
                theMap.put("originator_friends_count", originator != null ? originator.getFriendsCount() : "");
                theMap.put("originator_verified", originator != null ? originator.isVerified() : "");
            }
            User user = status.getUser();
            final boolean hasUser = (null != user);
            theMap.put("follower_count", hasUser ? user.getFollowersCount() : 0);
            theMap.put("friends_count", hasUser ? user.getFriendsCount() : 0);
            theMap.put("lang", hasUser ? user.getLang() : "");
            // resolution in seconds, -1 if not available?
            theMap.put("utc_offset", hasUser ? user.getUtcOffset() : -1);
            theMap.put("statuses_count", hasUser ? user.getStatusesCount() : 0);
            theMap.put("user_id", hasUser ? String.format("%d", user.getId()) : "");
            theMap.put("screen_name", hasUser ? user.getScreenName() : "");
            theMap.put("location", hasUser ? user.getLocation() : "");
            theMap.put("verified", hasUser ? user.isVerified() : "");
            theMap.put("ts", status.getCreatedAt().getTime());
            List<String> dimensions = Lists.newArrayList(theMap.keySet());
            return new MapBasedInputRow(status.getCreatedAt().getTime(), dimensions, theMap);
        }

        @Override
        public Runnable commit() {
            // reuse the same object each time
            return doNothingRunnable;
        }

        @Override
        public void close() throws IOException {
            log.info("CLOSE twitterstream");
            // invokes twitterStream.cleanUp()
            twitterStream.shutdown();
        }
    };
}
Also used : User(twitter4j.User) Matcher(java.util.regex.Matcher) ArrayList(java.util.ArrayList) TwitterStreamFactory(twitter4j.TwitterStreamFactory) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) StallWarning(twitter4j.StallWarning) ConnectionLifeCycleListener(twitter4j.ConnectionLifeCycleListener) HashtagEntity(twitter4j.HashtagEntity) MapBasedInputRow(io.druid.data.input.MapBasedInputRow) Status(twitter4j.Status) Firehose(io.druid.data.input.Firehose) IOException(java.io.IOException) TwitterStream(twitter4j.TwitterStream) StatusListener(twitter4j.StatusListener) StatusDeletionNotice(twitter4j.StatusDeletionNotice) GeoLocation(twitter4j.GeoLocation) Map(java.util.Map) TreeMap(java.util.TreeMap) Nullable(javax.annotation.Nullable)
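
Unlike the RabbitMQ example, this firehose has nothing to acknowledge: commit() always returns the shared doNothingRunnable, and close() simply shuts down the stream. Below is a minimal consumption sketch under those assumptions; the SpritzerDrain class name is illustrative, and twitter4j credentials are assumed to be configured externally (e.g. via twitter4j.properties).

import io.druid.data.input.Firehose;
import io.druid.data.input.InputRow;

import java.io.IOException;

public class SpritzerDrain {

    // Pull rows until maxEventCount/maxRunMinutes makes hasMore() return false
    // (which only happens when waitIfmax is false, i.e. maxEventCount >= 0).
    public static long drain(Firehose firehose) throws IOException {
        long count = 0;
        try {
            while (firehose.hasMore()) {
                // nextRow() blocks on queue.take() until twitter4j delivers a status.
                InputRow row = firehose.nextRow();
                count++;
                // ... feed the row to the realtime indexing task ...
            }
            // A no-op for this firehose, kept for symmetry with the interface.
            firehose.commit().run();
        } finally {
            // Invokes twitterStream.shutdown() via the firehose's close().
            firehose.close();
        }
        return count;
    }
}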

Aggregations

Firehose (io.druid.data.input.Firehose)18 InputRow (io.druid.data.input.InputRow)12 Test (org.junit.Test)8 MapBasedInputRow (io.druid.data.input.MapBasedInputRow)7 IOException (java.io.IOException)7 IAnswer (org.easymock.IAnswer)5 Map (java.util.Map)4 ParseException (io.druid.java.util.common.parsers.ParseException)3 HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec)3 List (java.util.List)3 DateTime (org.joda.time.DateTime)3 Interval (org.joda.time.Interval)3 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)2 Optional (com.google.common.base.Optional)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 ByteBufferInputRowParser (io.druid.data.input.ByteBufferInputRowParser)2 FirehoseFactory (io.druid.data.input.FirehoseFactory)2 RealtimeIOConfig (io.druid.segment.indexing.RealtimeIOConfig)2 GranularitySpec (io.druid.segment.indexing.granularity.GranularitySpec)2 ReplayableFirehoseFactory (io.druid.segment.realtime.firehose.ReplayableFirehoseFactory)2