use of io.anserini.document.twitter.Status in project Anserini by castorini.
the class TweetStreamIndexer method run.
/**
 * Subscribes to the Twitter sample stream and indexes every incoming status into
 * {@code TweetSearcher.indexWriter}. Messages that do not parse as a status are
 * inspected for a {@code delete} notice, in which case the matching document is
 * removed from the index.
 */
@Override
public void run() {
  tweetCount = 0;

  // Field type for the tweet text: tokenized, stored, with positions for phrase queries.
  final FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);

  TwitterStream twitterStream = new TwitterStreamFactory().getInstance();
  RawStreamListener rawListener = new RawStreamListener() {
    @Override
    public void onMessage(String rawString) {
      Status status = Status.fromJson(rawString);
      if (status == null) {
        // Not a status; the only other message acted upon is a delete notice.
        try {
          JsonObject obj = (JsonObject) JSON_PARSER.parse(rawString);
          if (obj.has("delete")) {
            long id = obj.getAsJsonObject("delete").getAsJsonObject("status").get("id").getAsLong();
            // A range query [id, id] on the point field is an exact-match lookup.
            Query q = LongPoint.newRangeQuery(StatusField.ID.name, id, id);
            TweetSearcher.indexWriter.deleteDocuments(q);
          }
        } catch (Exception e) {
          // Best effort: a malformed message must not kill the stream listener.
          e.printStackTrace();
        }
        return;
      }
      if (status.getText() == null) {
        return;
      }

      // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
      Document doc = new Document();
      doc.add(new LongPoint(StatusField.ID.name, status.getId()));
      doc.add(new StoredField(StatusField.ID.name, status.getId()));
      doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch()));
      doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch()));
      doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
      doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));
      // Fixed: friends/followers counts were previously written into each other's field.
      doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
      doc.add(new StoredField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
      doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
      doc.add(new StoredField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
      doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
      doc.add(new StoredField(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));

      long inReplyToStatusId = status.getInReplyToStatusId();
      if (inReplyToStatusId > 0) {
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
        doc.add(new StoredField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
        doc.add(new StoredField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
      }

      // Null-safe comparison: a status without a language is treated like "unknown".
      String lang = status.getLang();
      if (lang != null && !"unknown".equals(lang)) {
        doc.add(new TextField(StatusField.LANG.name, lang, Store.YES));
      }

      long retweetStatusId = status.getRetweetedStatusId();
      if (retweetStatusId > 0) {
        doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
        doc.add(new StoredField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
        doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
        doc.add(new StoredField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
        doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
        doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
        if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
          System.err.println("Error parsing retweet fields of " + status.getId());
        }
      }

      try {
        TweetSearcher.indexWriter.addDocument(doc);
        tweetCount++;
        if (tweetCount % 1000 == 0) {
          LOG.info(tweetCount + " statuses indexed");
        }
      } catch (IOException e) {
        // Keep consuming the stream even if a single document fails to index.
        e.printStackTrace();
      }
    }

    @Override
    public void onException(Exception e) {
      e.printStackTrace();
    }
  };
  twitterStream.addListener(rawListener);
  twitterStream.sample();
}
use of io.anserini.document.twitter.Status in project Anserini by castorini.
the class TRECIndexerRunnable method run.
/**
 * Subscribes to the Twitter sample stream and indexes English, non-retweet statuses
 * into {@code indexWriter}, committing after every document. A status that embeds a
 * retweeted status is replaced by that embedded status before filtering. Messages
 * that do not parse as a status are inspected for a {@code delete} notice, in which
 * case the matching document is removed from the index.
 */
@Override
public void run() {
  // Field type for the tokenized tweet text: tokenized, stored, with positions.
  final FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);

  twitterStream = new TwitterStreamFactory().getInstance();
  RawStreamListener rawListener = new RawStreamListener() {
    @Override
    public void onMessage(String rawString) {
      Status status = Status.fromJson(rawString);
      // TREC 2016 rule: Treatment of retweets.
      // Guard against null first: non-status messages (e.g. deletes) parse to null and
      // previously triggered a NullPointerException here, so deletes were never applied.
      if (status != null && status.getRetweetStatusString() != null) {
        status = Status.fromJson(status.getRetweetStatusString());
      }
      if (status == null) {
        try {
          JsonObject obj = (JsonObject) JSON_PARSER.parse(rawString);
          // Tweet deletion update: delete from the existing index.
          if (obj.has("delete")) {
            long id = obj.getAsJsonObject("delete").getAsJsonObject("status").get("id").getAsLong();
            // A range query [id, id] on the point field is an exact-match lookup.
            Query q = LongPoint.newRangeQuery(StatusField.ID.name, id, id);
            indexWriter.deleteDocuments(q);
          }
        } catch (Exception e) {
          // Best effort: a malformed message must not kill the stream listener.
          e.printStackTrace();
        }
        return;
      }

      String rawText = status.getText();
      if (rawText == null) {
        return;
      }
      // Filter out old-style retweets. startsWith is safe for texts shorter than 4 chars,
      // where substring(0, 4) would throw.
      if (rawText.startsWith("RT @")) {
        return;
      }
      // Keep English tweets only.
      if (!"en".equals(status.getLang())) {
        return;
      }

      // Strip non-ASCII characters, then tokenize.
      String processedRawText = rawText.replaceAll("[^\\x00-\\x7F]", "");
      String whiteSpaceTokenizedText = TRECTwokenizer.trecTokenizeText(processedRawText);
      // Compare by content, not by reference; skip tweets with nothing left to index.
      if (whiteSpaceTokenizedText == null || whiteSpaceTokenizedText.isEmpty()) {
        return;
      }

      // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
      Document doc = new Document();
      doc.add(new LongPoint(StatusField.ID.name, status.getId()));
      doc.add(new StoredField(StatusField.ID.name, status.getId()));
      doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch()));
      doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch()));
      doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
      doc.add(new TextField(StatusField.NAME.name, status.getName(), Store.YES));
      doc.add(new TextField(StatusField.PROFILE_IMAGE_URL.name, status.getProfileImageURL(), Store.YES));
      doc.add(new Field(StatusField.TEXT.name, whiteSpaceTokenizedText, textOptions));
      doc.add(new TextField(StatusField.RAW_TEXT.name, rawText, Store.YES));

      long retweetStatusId = status.getRetweetedStatusId();
      if (retweetStatusId > 0) {
        doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
        doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
        if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
          System.err.println("Error parsing retweet fields of " + status.getId());
        }
      }

      try {
        indexWriter.addDocument(doc);
        // NOTE(review): committing per document is expensive; presumably intentional so
        // that new tweets are visible to readers immediately - confirm.
        indexWriter.commit();
        tweetCount++;
        if (tweetCount % 1000 == 0) {
          LOG.info(tweetCount + " statuses indexed");
        }
      } catch (IOException e) {
        // Keep consuming the stream even if a single document fails to index.
        e.printStackTrace();
      }
    }

    @Override
    public void onException(Exception e) {
      e.printStackTrace();
    }
  };
  twitterStream.addListener(rawListener);
  twitterStream.sample();
}
use of io.anserini.document.twitter.Status in project Anserini by castorini.
the class TweetSearch method addTestTweet.
/**
 * Parses a JSON tweet, converts it into a Lucene document, adds it to
 * {@code testWriter} and commits.
 *
 * @param jsonTweetStr the tweet as a JSON string
 * @return the document that was added to the index
 * @throws IOException if adding the document or committing fails
 */
protected Document addTestTweet(String jsonTweetStr) throws IOException {
  Status status = Status.fromJson(jsonTweetStr);
  System.out.println(status);

  // Field type for the tweet text: tokenized, stored, with positions and term vectors.
  final FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);
  textOptions.setStoreTermVectors(true);

  // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
  Document doc = new Document();
  doc.add(new LongPoint(IndexTweets.StatusField.ID.name, status.getId()));
  doc.add(new StoredField(IndexTweets.StatusField.ID.name, status.getId()));
  doc.add(new LongPoint(IndexTweets.StatusField.EPOCH.name, status.getEpoch()));
  doc.add(new StoredField(IndexTweets.StatusField.EPOCH.name, status.getEpoch()));
  doc.add(new TextField(IndexTweets.StatusField.SCREEN_NAME.name, status.getScreenname(), Field.Store.YES));
  doc.add(new Field(IndexTweets.StatusField.TEXT.name, status.getText(), textOptions));
  // Fixed: friends/followers counts were previously written into each other's field.
  doc.add(new IntPoint(IndexTweets.StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
  doc.add(new StoredField(IndexTweets.StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
  doc.add(new IntPoint(IndexTweets.StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
  doc.add(new StoredField(IndexTweets.StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
  doc.add(new IntPoint(IndexTweets.StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
  doc.add(new StoredField(IndexTweets.StatusField.STATUSES_COUNT.name, status.getStatusesCount()));

  long inReplyToStatusId = status.getInReplyToStatusId();
  if (inReplyToStatusId > 0) {
    doc.add(new LongPoint(IndexTweets.StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
    doc.add(new StoredField(IndexTweets.StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
    doc.add(new LongPoint(IndexTweets.StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
    doc.add(new StoredField(IndexTweets.StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
  }

  // Null-safe comparison: a status without a language is treated like "unknown".
  String lang = status.getLang();
  if (lang != null && !"unknown".equals(lang)) {
    doc.add(new TextField(IndexTweets.StatusField.LANG.name, lang, Field.Store.YES));
  }

  long retweetStatusId = status.getRetweetedStatusId();
  if (retweetStatusId > 0) {
    doc.add(new LongPoint(IndexTweets.StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
    doc.add(new StoredField(IndexTweets.StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
    doc.add(new LongPoint(IndexTweets.StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
    doc.add(new StoredField(IndexTweets.StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
    doc.add(new IntPoint(IndexTweets.StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
    doc.add(new StoredField(IndexTweets.StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
  }

  testWriter.addDocument(doc);
  testWriter.commit();
  return doc;
}
use of io.anserini.document.twitter.Status in project Anserini by castorini.
the class IndexTweets method main.
/**
 * Command-line entry point: builds a fresh Lucene index from a tweet collection.
 *
 * <p>Requires the collection and index options. Optionally takes a bzip2-compressed
 * file of deleted tweet ids to skip, a maximum tweet id, a flag to store term vectors
 * and a flag to merge the index into a single segment once indexing is done.
 *
 * @param args command-line arguments
 * @throws Exception if option handling, reading the deletes file, or opening/closing
 *     the index fails
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
  Options options = new Options();
  options.addOption(new Option(HELP_OPTION, "show help"));
  options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
  options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));
  options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory").create(COLLECTION_OPTION));
  options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
  options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids").create(DELETES_OPTION));
  options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

  CommandLine cmdline = null;
  CommandLineParser parser = new GnuParser();
  try {
    cmdline = parser.parse(options, args);
  } catch (ParseException exp) {
    System.err.println("Error parsing command line: " + exp.getMessage());
    System.exit(-1);
  }
  if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp(IndexTweets.class.getName(), options);
    System.exit(-1);
  }

  String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
  String indexPath = cmdline.getOptionValue(INDEX_OPTION);

  // Field type for the tweet text: tokenized, stored, with positions; term vectors on request.
  final FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);
  if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
    textOptions.setStoreTermVectors(true);
  }

  LOG.info("collection: " + collectionPath);
  LOG.info("index: " + indexPath);

  LongOpenHashSet deletes = null;
  if (cmdline.hasOption(DELETES_OPTION)) {
    deletes = new LongOpenHashSet();
    File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION));
    if (!deletesFile.exists()) {
      System.err.println("Error: " + deletesFile + " does not exist!");
      System.exit(-1);
    }
    LOG.info("Reading deletes from " + deletesFile);

    // try-with-resources so the streams are released even if a line fails to parse.
    try (FileInputStream fin = new FileInputStream(deletesFile)) {
      byte[] ignoreBytes = new byte[2];
      // "B", "Z" bytes from commandline tools
      fin.read(ignoreBytes);
      try (BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin)))) {
        String s;
        while ((s = br.readLine()) != null) {
          // The tweet id is the whole line, or the first tab-separated column.
          int tab = s.indexOf('\t');
          deletes.add(Long.parseLong(tab >= 0 ? s.substring(0, tab) : s));
        }
      }
    }
    LOG.info("Read " + deletes.size() + " tweetids from deletes file.");
  }

  long maxId = Long.MAX_VALUE;
  if (cmdline.hasOption(MAX_ID_OPTION)) {
    maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION));
    // Fixed: this value was previously logged under the misleading label "index: ".
    LOG.info("max id: " + maxId);
  }

  long startTime = System.currentTimeMillis();
  File file = new File(collectionPath);
  if (!file.exists()) {
    System.err.println("Error: " + file + " does not exist!");
    System.exit(-1);
  }

  StatusStream stream = new JsonStatusCorpusReader(file);
  Directory dir = FSDirectory.open(Paths.get(indexPath));
  final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);
  // CREATE overwrites any existing index at the target location.
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  IndexWriter writer = new IndexWriter(dir, config);

  int cnt = 0;
  Status status;
  try {
    while ((status = stream.next()) != null) {
      if (status.getText() == null) {
        continue;
      }
      // Skip deletes tweetids.
      if (deletes != null && deletes.contains(status.getId())) {
        continue;
      }
      if (status.getId() > maxId) {
        continue;
      }
      cnt++;

      // Each numeric value is added twice: as a point (searchable) and as a stored field (retrievable).
      Document doc = new Document();
      doc.add(new LongPoint(StatusField.ID.name, status.getId()));
      doc.add(new StoredField(StatusField.ID.name, status.getId()));
      doc.add(new LongPoint(StatusField.EPOCH.name, status.getEpoch()));
      doc.add(new StoredField(StatusField.EPOCH.name, status.getEpoch()));
      doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));
      doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));
      // Fixed: friends/followers counts were previously written into each other's field.
      doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
      doc.add(new StoredField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount()));
      doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
      doc.add(new StoredField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount()));
      doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));
      doc.add(new StoredField(StatusField.STATUSES_COUNT.name, status.getStatusesCount()));

      long inReplyToStatusId = status.getInReplyToStatusId();
      if (inReplyToStatusId > 0) {
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
        doc.add(new StoredField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId));
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
        doc.add(new StoredField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId()));
      }

      String lang = status.getLang();
      if (!lang.equals("unknown")) {
        doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
      }

      long retweetStatusId = status.getRetweetedStatusId();
      if (retweetStatusId > 0) {
        doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
        doc.add(new StoredField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId));
        doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
        doc.add(new StoredField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId()));
        doc.add(new IntPoint(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
        doc.add(new StoredField(StatusField.RETWEET_COUNT.name, status.getRetweetCount()));
        if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
          LOG.warn("Error parsing retweet fields of " + status.getId());
        }
      }

      writer.addDocument(doc);
      if (cnt % 100000 == 0) {
        LOG.info(cnt + " statuses indexed");
      }
    }
    LOG.info(String.format("Total of %s statuses added", cnt));

    if (cmdline.hasOption(OPTIMIZE_OPTION)) {
      LOG.info("Merging segments...");
      writer.forceMerge(1);
      LOG.info("Done!");
    }
    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
  } catch (Exception e) {
    // Report through the logger; the finally block below still closes the index.
    LOG.error("Error while indexing statuses", e);
  } finally {
    // Nested so that a failure closing one resource does not skip the others.
    try {
      writer.close();
    } finally {
      try {
        dir.close();
      } finally {
        stream.close();
      }
    }
  }
}
Aggregations