Use of com.twitter.twittertext.Extractor in the project Anserini by castorini.
From the class TweetGenerator, method createDocument.
/**
 * Converts a tweet into a Lucene {@link Document} ready for indexing.
 *
 * <p>The tweet text is trimmed, validated with {@code TwitterTextParser}, cut down to the
 * parser's valid text range and, unless {@code args.tweetKeepUrls} is set, stripped of every
 * URL found by the twitter-text {@code Extractor}. The resulting document carries the tweet id
 * (stored string, long point and numeric doc value), the screen name, the user's friends,
 * followers and statuses counts and, when present, the epoch, reply information, retweet
 * information and language. The raw JSON is stored when {@code args.storeRaw} is set, and the
 * cleaned text is indexed under {@code IndexArgs.CONTENTS}.
 *
 * @param tweetDoc the tweet to convert
 * @return the populated Lucene document
 * @throws GeneratorException declared by the signature; the body throws
 *     {@code EmptyDocumentException} when the text is blank before or after cleaning,
 *     {@code InvalidDocumentException} when the parser rejects the tweet, and
 *     {@code SkippedDocumentException} when the tweet is listed in {@code deletes}, has an id
 *     greater than {@code args.tweetMaxId}, or is a retweet while
 *     {@code args.tweetKeepRetweets} is false
 */
@Override
public Document createDocument(TweetCollection.Document tweetDoc) throws GeneratorException {
  String id = tweetDoc.id();

  // Trim once and reuse; the original re-trimmed the contents for every use.
  final String contents = tweetDoc.contents().trim();
  if (contents.isEmpty()) {
    throw new EmptyDocumentException();
  }

  final TwitterTextParseResults result = TwitterTextParser.parseTweet(contents);
  if (!result.isValid) {
    throw new InvalidDocumentException();
  }

  // NOTE(review): String.substring treats the end index as exclusive; confirm that
  // validTextRange.end is also exclusive, otherwise the last character is dropped here.
  String text = contents.substring(result.validTextRange.start, result.validTextRange.end);

  if (!args.tweetKeepUrls) {
    final Extractor extractor = new Extractor();
    final List<String> urls = extractor.extractURLs(text);
    for (String url : urls) {
      // Literal replacement: replaceAll would interpret the URL as a regular expression, so
      // characters such as '.', '?', '+' or '(' could mis-match or throw
      // PatternSyntaxException.
      text = text.replace(url, "");
    }
  }

  text = text.trim();
  if (text.isEmpty()) {
    throw new EmptyDocumentException();
  }

  // Skip deleted tweet ids.
  if (deletes != null && deletes.contains(id)) {
    throw new SkippedDocumentException();
  }

  // Skip tweets beyond the configured maximum id.
  if (tweetDoc.getIdLong() > args.tweetMaxId) {
    throw new SkippedDocumentException();
  }

  // Skip retweets unless they were explicitly requested.
  if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
    throw new SkippedDocumentException();
  }

  Document doc = new Document();
  doc.add(new StringField(IndexArgs.ID, id, Field.Store.YES));

  // We need this to break scoring ties.
  doc.add(new LongPoint(TweetField.ID_LONG.name, tweetDoc.getIdLong()));
  doc.add(new NumericDocValuesField(TweetField.ID_LONG.name, tweetDoc.getIdLong()));

  tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(TweetField.EPOCH.name, epoch)));
  doc.add(new StringField(TweetField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));

  // Each count goes under its own field name (friends and followers were previously swapped).
  doc.add(new IntPoint(TweetField.FRIENDS_COUNT.name, tweetDoc.getFriendsCount()));
  doc.add(new IntPoint(TweetField.FOLLOWERS_COUNT.name, tweetDoc.getFollowersCount()));
  doc.add(new IntPoint(TweetField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));

  // The reply user id is only indexed when a reply status id is present.
  tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
    doc.add(new LongPoint(TweetField.IN_REPLY_TO_STATUS_ID.name, rid));
    tweetDoc.getInReplyToUserId().ifPresent(ruid -> doc.add(new LongPoint(TweetField.IN_REPLY_TO_USER_ID.name, ruid)));
  });

  // The retweeted user id and retweet count are only indexed when a retweeted status id is present.
  tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
    doc.add(new LongPoint(TweetField.RETWEETED_STATUS_ID.name, rid));
    tweetDoc.getRetweetedUserId().ifPresent(ruid -> doc.add(new LongPoint(TweetField.RETWEETED_USER_ID.name, ruid)));
    tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(TweetField.RETWEET_COUNT.name, rc)));
  });

  tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(TweetField.LANG.name, lang, Field.Store.NO)));

  if (args.storeRaw) {
    // Store the raw JSON string as one single field.
    doc.add(new StoredField(IndexArgs.RAW, tweetDoc.getJsonString()));
  }

  FieldType fieldType = new FieldType();
  fieldType.setStored(args.storeContents);

  // Are we storing document vectors?
  if (args.storeDocvectors) {
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
  }

  // Are we building a "positional" or "count" index?
  if (args.storePositions) {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  } else {
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
  }

  doc.add(new Field(IndexArgs.CONTENTS, text, fieldType));
  return doc;
}
Aggregations