Search in sources :

Example 1 with Status

use of io.anserini.document.twitter.Status in project Anserini by castorini.

the class TweetStreamIndexer method run.

/**
 * Starts the Twitter sample stream and indexes each incoming message into
 * {@code TweetSearcher.indexWriter}.
 *
 * <p>Every raw message is parsed with {@code Status.fromJson}. Messages that do not
 * parse as a status are inspected for a {@code "delete"} notice; if one is found, the
 * document with the matching id is removed from the index. Statuses without text are
 * ignored. The running total of indexed statuses is kept in {@code tweetCount} and
 * logged every 1000 documents.
 */
@Override
public void run() {
    tweetCount = 0;
    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    TwitterStream twitterStream = new TwitterStreamFactory().getInstance();
    RawStreamListener rawListener = new RawStreamListener() {

        @Override
        public void onMessage(String rawString) {
            Status status = Status.fromJson(rawString);
            if (status == null) {
                // Not a regular status: the only other message acted upon is a delete notice.
                try {
                    JsonObject obj = (JsonObject) JSON_PARSER.parse(rawString);
                    if (obj.has("delete")) {
                        long id = obj.getAsJsonObject("delete").getAsJsonObject("status").get("id").getAsLong();
                        // A range query with identical bounds matches exactly the document with this id.
                        Query q = LongPoint.newRangeQuery(StatusField.ID.name, id, id);
                        TweetSearcher.indexWriter.deleteDocuments(q);
                    }
                } catch (Exception e) {
                    // Broad catch on purpose: a malformed message must not stop the stream listener.
                    LOG.error("Failed to process non-status stream message", e);
                }
                return;
            }
            if (status.getText() == null) {
                return;
            }
            Document doc = new Document();
            // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
            doc.add(new LongPoint(StatusField.ID.name, status.getId()));
            doc.add(new StoredField(StatusField.ID.name, status.getId()));
            doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
            doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));
            // Fixed: the friends and followers counts used to be written to each other's field.
            doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
            doc.add(new StoredField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
            doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
            doc.add(new StoredField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
            doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
            doc.add(new StoredField(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
            long inReplyToStatusId = status.getInReplyToStatusId();
            if (inReplyToStatusId > 0) {
                doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
                doc.add(new StoredField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
                doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
                doc.add(new StoredField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
            }
            String lang = status.getLang();
            // Null guard: a status without a language is indexed without the LANG field.
            if (lang != null && !lang.equals("unknown")) {
                doc.add(new TextField(StatusField.LANG.name, lang, Store.YES));
            }
            long retweetStatusId = status.getRetweetedStatusId();
            if (retweetStatusId > 0) {
                doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
                doc.add(new StoredField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
                doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
                doc.add(new StoredField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
                doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                    System.err.println("Error parsing retweet fields of " + status.getId());
                }
            }
            try {
                TweetSearcher.indexWriter.addDocument(doc);
                tweetCount++;
                if (tweetCount % 1000 == 0) {
                    LOG.info(tweetCount + " statuses indexed");
                }
            } catch (IOException e) {
                LOG.error("Failed to index status " + status.getId(), e);
            }
        }

        @Override
        public void onException(Exception e) {
            LOG.error("Twitter stream reported an exception", e);
        }
    };
    twitterStream.addListener(rawListener);
    twitterStream.sample();
}
Also used : Status(io.anserini.document.twitter.Status) RawStreamListener(twitter4j.RawStreamListener) Query(org.apache.lucene.search.Query) JsonObject(com.google.gson.JsonObject) IOException(java.io.IOException) IOException(java.io.IOException) TwitterStreamFactory(twitter4j.TwitterStreamFactory) TwitterStream(twitter4j.TwitterStream)

Example 2 with Status

use of io.anserini.document.twitter.Status in project Anserini by castorini.

the class TRECIndexerRunnable method run.

/**
 * Starts the Twitter sample stream and indexes English, non-retweet statuses into
 * {@code indexWriter}, committing after every added document.
 *
 * <p>Processing of each raw message:
 * <ol>
 *   <li>Parse it with {@code Status.fromJson}; if the status carries an embedded
 *       retweeted status, that embedded status replaces it.</li>
 *   <li>If no status could be parsed, look for a {@code "delete"} notice and remove the
 *       document with the matching id from the index.</li>
 *   <li>Skip statuses with no text, text starting with {@code "RT @"}, or a language
 *       other than {@code "en"}.</li>
 *   <li>Strip non-ASCII characters, tokenize with {@code TRECTwokenizer}, and skip the
 *       status if nothing remains.</li>
 * </ol>
 */
@Override
public void run() {
    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    twitterStream = new TwitterStreamFactory().getInstance();
    RawStreamListener rawListener = new RawStreamListener() {

        @Override
        public void onMessage(String rawString) {
            Status status = Status.fromJson(rawString);
            // TREC 2016 rule: Treatment of retweets.
            // Fixed: guard against a null status first, otherwise delete notices throw a
            // NullPointerException here and the deletion branch below is never reached.
            if (status != null && status.getRetweetStatusString() != null) {
                status = Status.fromJson(status.getRetweetStatusString());
            }
            if (status == null) {
                try {
                    JsonObject obj = (JsonObject) JSON_PARSER.parse(rawString);
                    // Tweet deletion update: delete from the existing index
                    if (obj.has("delete")) {
                        long id = obj.getAsJsonObject("delete").getAsJsonObject("status").get("id").getAsLong();
                        // A range query with identical bounds matches exactly the document with this id.
                        Query q = LongPoint.newRangeQuery(StatusField.ID.name, id, id);
                        indexWriter.deleteDocuments(q);
                    }
                } catch (Exception e) {
                    // Broad catch on purpose: a malformed message must not stop the stream listener.
                    LOG.error("Failed to process non-status stream message", e);
                }
                return;
            }
            String rawText = status.getText();
            if (rawText == null) {
                return;
            }
            // filter out retweets (startsWith is safe for texts shorter than four characters)
            if (rawText.startsWith("RT @")) {
                return;
            }
            // keep English statuses only; a missing language is treated as non-English
            if (!"en".equals(status.getLang())) {
                return;
            }
            // drop every non-ASCII character before tokenizing
            String processedRawText = rawText.replaceAll("[^\\x00-\\x7F]", "");
            String whiteSpaceTokenizedText = TRECTwokenizer.trecTokenizeText(processedRawText);
            // Fixed: compare by content; '==' only tested reference identity.
            if (whiteSpaceTokenizedText == null || whiteSpaceTokenizedText.isEmpty()) {
                return;
            }
            Document doc = new Document();
            // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
            doc.add(new LongPoint(StatusField.ID.name, status.getId()));
            doc.add(new StoredField(StatusField.ID.name, status.getId()));
            doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
            doc.add(new TextField(StatusField.NAME.name, status.getName(), Store.YES));
            doc.add(new TextField(StatusField.PROFILE_IMAGE_URL.name, status.getProfileImageURL(), Store.YES));
            doc.add(new Field(StatusField.TEXT.name, whiteSpaceTokenizedText, textOptions));
            doc.add(new TextField(StatusField.RAW_TEXT.name, rawText, Store.YES));
            long retweetStatusId = status.getRetweetedStatusId();
            if (retweetStatusId > 0) {
                doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                    System.err.println("Error parsing retweet fields of " + status.getId());
                }
            }
            try {
                indexWriter.addDocument(doc);
                indexWriter.commit();
                tweetCount++;
                if (tweetCount % 1000 == 0) {
                    LOG.info(tweetCount + " statuses indexed");
                }
            } catch (IOException e) {
                LOG.error("Failed to index status " + status.getId(), e);
            }
        }

        @Override
        public void onException(Exception e) {
            LOG.error("Twitter stream reported an exception", e);
        }
    };
    twitterStream.addListener(rawListener);
    twitterStream.sample();
}
Also used : Status(io.anserini.document.twitter.Status) RawStreamListener(twitter4j.RawStreamListener) Query(org.apache.lucene.search.Query) JsonObject(com.google.gson.JsonObject) IOException(java.io.IOException) IOException(java.io.IOException) TwitterStreamFactory(twitter4j.TwitterStreamFactory)

Example 3 with Status

use of io.anserini.document.twitter.Status in project Anserini by castorini.

the class TweetSearch method addTestTweet.

/**
 * Parses a tweet from JSON, converts it to a Lucene document, adds the document to
 * {@code testWriter}, and commits.
 *
 * @param jsonTweetStr the tweet as a JSON string
 * @return the document that was added to the test index
 * @throws IOException if adding the document or committing fails
 * @throws IllegalArgumentException if {@code jsonTweetStr} cannot be parsed into a status
 */
protected Document addTestTweet(String jsonTweetStr) throws IOException {
    Status status = Status.fromJson(jsonTweetStr);
    System.out.println(status);
    if (status == null) {
        // Fail with a descriptive message instead of a NullPointerException further down.
        throw new IllegalArgumentException("Could not parse tweet JSON: " + jsonTweetStr);
    }
    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);
    Document doc = new Document();
    // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
    doc.add(new LongPoint(IndexTweets.StatusField.ID.name, status.getId()));
    doc.add(new StoredField(IndexTweets.StatusField.ID.name, status.getId()));
    doc.add(new LongPoint(IndexTweets.StatusField.EPOCH.name, status.getEpoch()));
    doc.add(new StoredField(IndexTweets.StatusField.EPOCH.name, status.getEpoch()));
    doc.add(new TextField(IndexTweets.StatusField.SCREEN_NAME.name, status.getScreenname(), Field.Store.YES));
    doc.add(new Field(IndexTweets.StatusField.TEXT.name, status.getText(), textOptions));
    // Fixed: the friends and followers counts used to be written to each other's field.
    doc.add(new IntPoint(IndexTweets.StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
    doc.add(new StoredField(IndexTweets.StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
    doc.add(new IntPoint(IndexTweets.StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
    doc.add(new StoredField(IndexTweets.StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
    doc.add(new IntPoint(IndexTweets.StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
    doc.add(new StoredField(IndexTweets.StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
    long inReplyToStatusId = status.getInReplyToStatusId();
    if (inReplyToStatusId > 0) {
        doc.add(new LongPoint(IndexTweets.StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
        doc.add(new StoredField(IndexTweets.StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
        doc.add(new LongPoint(IndexTweets.StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
        doc.add(new StoredField(IndexTweets.StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
    }
    String lang = status.getLang();
    // Null guard: a status without a language is indexed without the LANG field.
    if (lang != null && !lang.equals("unknown")) {
        doc.add(new TextField(IndexTweets.StatusField.LANG.name, lang, Field.Store.YES));
    }
    long retweetStatusId = status.getRetweetedStatusId();
    if (retweetStatusId > 0) {
        doc.add(new LongPoint(IndexTweets.StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
        doc.add(new StoredField(IndexTweets.StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
        doc.add(new LongPoint(IndexTweets.StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
        doc.add(new StoredField(IndexTweets.StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
        doc.add(new IntPoint(IndexTweets.StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
        doc.add(new StoredField(IndexTweets.StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
    }
    testWriter.addDocument(doc);
    testWriter.commit();
    return doc;
}
Also used : Status(io.anserini.document.twitter.Status)

Example 4 with Status

use of io.anserini.document.twitter.Status in project Anserini by castorini.

the class IndexTweets method main.

/**
 * Command-line entry point: builds a Lucene index from a tweet collection.
 *
 * <p>Required options are the source collection and the index location. Optional
 * options supply a bzip2-compressed file of deleted tweet ids to skip, a maximum tweet
 * id, whether to store term vectors, and whether to merge the index into a single
 * segment at the end. The index is always opened in {@code CREATE} mode.
 *
 * @param args command-line arguments
 * @throws Exception if reading the deletes file, opening the index, or indexing fails
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));
    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids").create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));
    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }
    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweets.class.getName(), options);
        System.exit(-1);
    }
    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);
    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);
    }
    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);
    LongOpenHashSet deletes = null;
    if (cmdline.hasOption(DELETES_OPTION)) {
        deletes = new LongOpenHashSet();
        File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION));
        if (!deletesFile.exists()) {
            System.err.println("Error: " + deletesFile + " does not exist!");
            System.exit(-1);
        }
        LOG.info("Reading deletes from " + deletesFile);
        // try-with-resources closes both streams even if a line fails to parse.
        try (FileInputStream fin = new FileInputStream(deletesFile)) {
            byte[] ignoreBytes = new byte[2];
            // "B", "Z" bytes from commandline tools
            fin.read(ignoreBytes);
            try (BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin)))) {
                String s;
                while ((s = br.readLine()) != null) {
                    // A line is either a bare tweet id or tab-separated with the id in the first column.
                    if (s.contains("\t")) {
                        deletes.add(Long.parseLong(s.split("\t")[0]));
                    } else {
                        deletes.add(Long.parseLong(s));
                    }
                }
            }
        }
        LOG.info("Read " + deletes.size() + " tweetids from deletes file.");
    }
    long maxId = Long.MAX_VALUE;
    if (cmdline.hasOption(MAX_ID_OPTION)) {
        maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION));
        // Fixed: this line was labelled "index: ", duplicating the index-path message above.
        LOG.info("max id: " + maxId);
    }
    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }
    StatusStream stream = new JsonStatusCorpusReader(file);
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {
            if (status.getText() == null) {
                continue;
            }
            // Skip deletes tweetids.
            if (deletes != null && deletes.contains(status.getId())) {
                continue;
            }
            if (status.getId() > maxId) {
                continue;
            }
            cnt++;
            Document doc = new Document();
            // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
            doc.add(new LongPoint(StatusField.ID.name, status.getId()));
            doc.add(new StoredField(StatusField.ID.name, status.getId()));
            doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch()));
            doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
            doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));
            // Fixed: the friends and followers counts used to be written to each other's field.
            doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
            doc.add(new StoredField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
            doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
            doc.add(new StoredField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
            doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
            doc.add(new StoredField(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
            long inReplyToStatusId = status.getInReplyToStatusId();
            if (inReplyToStatusId > 0) {
                doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
                doc.add(new StoredField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
                doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
                doc.add(new StoredField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
            }
            String lang = status.getLang();
            // Null guard: a status without a language is indexed without the LANG field.
            if (lang != null && !lang.equals("unknown")) {
                doc.add(new TextField(StatusField.LANG.name, lang, Store.YES));
            }
            long retweetStatusId = status.getRetweetedStatusId();
            if (retweetStatusId > 0) {
                doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
                doc.add(new StoredField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
                doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
                doc.add(new StoredField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
                doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
                if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                    LOG.warn("Error parsing retweet fields of " + status.getId());
                }
            }
            writer.addDocument(doc);
            if (cnt % 100000 == 0) {
                LOG.info(cnt + " statuses indexed");
            }
        }
        LOG.info(String.format("Total of %s statuses added", cnt));
        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }
        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        // Log and rethrow so a failed run does not exit as if the index were complete.
        LOG.error("Indexing failed after " + cnt + " statuses", e);
        throw e;
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}
Also used : IndexOptions(org.apache.lucene.index.IndexOptions) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) StatusStream(io.anserini.document.twitter.StatusStream) JsonStatusCorpusReader(io.anserini.document.twitter.JsonStatusCorpusReader) LongOpenHashSet(it.unimi.dsi.fastutil.longs.LongOpenHashSet) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Status(io.anserini.document.twitter.Status) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream) IndexWriter(org.apache.lucene.index.IndexWriter) BufferedReader(java.io.BufferedReader) File(java.io.File) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

Status (io.anserini.document.twitter.Status)4 JsonObject (com.google.gson.JsonObject)2 IOException (java.io.IOException)2 Query (org.apache.lucene.search.Query)2 RawStreamListener (twitter4j.RawStreamListener)2 TwitterStreamFactory (twitter4j.TwitterStreamFactory)2 JsonStatusCorpusReader (io.anserini.document.twitter.JsonStatusCorpusReader)1 StatusStream (io.anserini.document.twitter.StatusStream)1 LongOpenHashSet (it.unimi.dsi.fastutil.longs.LongOpenHashSet)1 BufferedReader (java.io.BufferedReader)1 File (java.io.File)1 FileInputStream (java.io.FileInputStream)1 InputStreamReader (java.io.InputStreamReader)1 IndexOptions (org.apache.lucene.index.IndexOptions)1 IndexWriter (org.apache.lucene.index.IndexWriter)1 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)1 Directory (org.apache.lucene.store.Directory)1 FSDirectory (org.apache.lucene.store.FSDirectory)1 CBZip2InputStream (org.apache.tools.bzip2.CBZip2InputStream)1 TwitterStream (twitter4j.TwitterStream)1