use of io.anserini.collection.WashingtonPostCollection in project Anserini by castorini.
the class BackgroundLinkingTopicReader method extractArticlePlainText.
// Note that there's code duplication here with the WashingtonPostCollection. We can't just call a method there
// because the version of the code inside WashingtonPostCollection.Document modifies internal state (e.g., "kicker"
// and "caption"). Haven't thought of a good solution for this yet.
/**
 * Extracts the plain text of a Washington Post article from its raw JSON record.
 *
 * <p>The result starts with the value of {@code getTitle()} on its own line. For every non-null content
 * element it then appends, each on its own line:
 * <ul>
 *   <li>the element's content with markup removed by Jsoup, when both type and content are present and
 *       the type is listed in {@code WashingtonPostCollection.Document.CONTENT_TYPE_TAG};</li>
 *   <li>the element's full caption with markup removed by Jsoup, when present.</li>
 * </ul>
 *
 * @param record JSON serialization of a single article
 * @return the extracted text, one newline-terminated line per extracted piece
 * @throws RuntimeException if the record cannot be deserialized; the {@link IOException} is kept as the cause
 */
private static String extractArticlePlainText(String record) {
  WashingtonPostCollection.Document.WashingtonPostObject wapoObj;
  try {
    wapoObj = WAPO_RECORD_MAPPER.readValue(record, WashingtonPostCollection.Document.WashingtonPostObject.class);
  } catch (IOException e) {
    // Something is wrong... abort!
    throw new RuntimeException(e);
  }

  StringBuilder contentBuilder = new StringBuilder();
  // NOTE(review): if getTitle() can return null, the literal text "null" ends up in the output -- confirm
  // whether that is intended.
  contentBuilder.append(wapoObj.getTitle()).append("\n");

  wapoObj.getContents().ifPresent(contents -> {
    for (WashingtonPostCollection.Document.WashingtonPostObject.Content contentObj : contents) {
      if (contentObj == null) {
        continue;
      }

      // Nested ifPresent already requires both values, so no separate isPresent() guard is needed.
      contentObj.getType().ifPresent(type ->
          contentObj.getContent().ifPresent(content -> {
            if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
              contentBuilder.append(Jsoup.parse(content).text()).append("\n");
            }
          }));

      // Captions are appended regardless of the element's type.
      contentObj.getFullCaption().ifPresent(caption ->
          contentBuilder.append(Jsoup.parse(caption).text()).append("\n"));
    }
  });

  return contentBuilder.toString();
}

// Mapper shared by every call to extractArticlePlainText: it is fully configured here and only read
// afterwards, so it does not have to be rebuilt for each record.
private static final ObjectMapper WAPO_RECORD_MAPPER = new ObjectMapper()
    // Ignore unrecognized properties
    .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
    // Deserialize Java 8 Optional: http://www.baeldung.com/jackson-optional
    .registerModule(new Jdk8Module());
use of io.anserini.collection.WashingtonPostCollection in project anserini by castorini.
the class BackgroundLinkingTopicReader method extractArticlePlainText.
// Note that there's code duplication here with the WashingtonPostCollection. We can't just call a method there
// because the version of the code inside WashingtonPostCollection.Document modifies internal state (e.g., "kicker"
// and "caption"). Haven't thought of a good solution for this yet.
/**
 * Extracts the plain text of a Washington Post article from its raw JSON record.
 *
 * <p>The result starts with the value of {@code getTitle()} on its own line. For every non-null content
 * element it then appends, each on its own line:
 * <ul>
 *   <li>the element's content with markup removed by Jsoup, when both type and content are present and
 *       the type is listed in {@code WashingtonPostCollection.Document.CONTENT_TYPE_TAG};</li>
 *   <li>the element's full caption with markup removed by Jsoup, when present.</li>
 * </ul>
 *
 * @param record JSON serialization of a single article
 * @return the extracted text, one newline-terminated line per extracted piece
 * @throws RuntimeException if the record cannot be deserialized; the {@link IOException} is kept as the cause
 */
private static String extractArticlePlainText(String record) {
  WashingtonPostCollection.Document.WashingtonPostObject wapoObj;
  try {
    wapoObj = WAPO_RECORD_MAPPER.readValue(record, WashingtonPostCollection.Document.WashingtonPostObject.class);
  } catch (IOException e) {
    // Something is wrong... abort!
    throw new RuntimeException(e);
  }

  StringBuilder contentBuilder = new StringBuilder();
  // NOTE(review): if getTitle() can return null, the literal text "null" ends up in the output -- confirm
  // whether that is intended.
  contentBuilder.append(wapoObj.getTitle()).append("\n");

  wapoObj.getContents().ifPresent(contents -> {
    for (WashingtonPostCollection.Document.WashingtonPostObject.Content contentObj : contents) {
      if (contentObj == null) {
        continue;
      }

      // Nested ifPresent already requires both values, so no separate isPresent() guard is needed.
      contentObj.getType().ifPresent(type ->
          contentObj.getContent().ifPresent(content -> {
            if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
              contentBuilder.append(Jsoup.parse(content).text()).append("\n");
            }
          }));

      // Captions are appended regardless of the element's type.
      contentObj.getFullCaption().ifPresent(caption ->
          contentBuilder.append(Jsoup.parse(caption).text()).append("\n"));
    }
  });

  return contentBuilder.toString();
}

// Mapper shared by every call to extractArticlePlainText: it is fully configured here and only read
// afterwards, so it does not have to be rebuilt for each record.
private static final ObjectMapper WAPO_RECORD_MAPPER = new ObjectMapper()
    // Ignore unrecognized properties
    .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
    // Deserialize Java 8 Optional: http://www.baeldung.com/jackson-optional
    .registerModule(new Jdk8Module());
Aggregations