Search in sources :

Example 1 with WashingtonPostCollection

use of io.anserini.collection.WashingtonPostCollection in project Anserini by castorini.

The method extractArticlePlainText of the class BackgroundLinkingTopicReader.

/**
 * Extracts the plain text of a Washington Post article from its raw JSON record.
 *
 * <p>The returned string starts with the article title on its own line. Then, for every non-null
 * content element, it contains the element's content run through {@code Jsoup.parse(...).text()}
 * when the element's type is listed in {@code WashingtonPostCollection.Document.CONTENT_TYPE_TAG},
 * followed by the element's full caption (also run through Jsoup) when one is present. Every
 * appended piece is terminated by a newline.
 *
 * <p>Note that there's code duplication here with the WashingtonPostCollection. We can't just call
 * a method there because the version of the code inside WashingtonPostCollection.Document modifies
 * internal state (e.g., "kicker" and "caption"). Haven't thought of a good solution for this yet.
 *
 * @param record the JSON text of a single article
 * @return the newline-separated plain text assembled from the title, contents and captions
 * @throws RuntimeException wrapping the {@link IOException} raised when the record cannot be
 *     deserialized
 */
private static String extractArticlePlainText(String record) {
    WashingtonPostCollection.Document.WashingtonPostObject wapoObj;
    ObjectMapper mapper = new ObjectMapper();
    try {
        wapoObj = mapper
            // Ignore unrecognized properties
            .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
            // Deserialize Java 8 Optional: http://www.baeldung.com/jackson-optional
            .registerModule(new Jdk8Module())
            .readValue(record, WashingtonPostCollection.Document.WashingtonPostObject.class);
    } catch (IOException e) {
        // Something is wrong... abort!
        throw new RuntimeException(e);
    }

    StringBuilder contentBuilder = new StringBuilder();
    // NOTE(review): the title is appended as-is; if getTitle() can return null this emits the
    // literal text "null" - confirm against WashingtonPostObject.
    contentBuilder.append(wapoObj.getTitle()).append("\n");

    wapoObj.getContents().ifPresent(contents -> {
        for (WashingtonPostCollection.Document.WashingtonPostObject.Content contentObj : contents) {
            if (contentObj == null) {
                continue;
            }
            // Nested ifPresent already requires both values, so no separate isPresent() guard is needed.
            contentObj.getType().ifPresent(type ->
                contentObj.getContent().ifPresent(content -> {
                    if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
                        contentBuilder.append(Jsoup.parse(content).text()).append("\n");
                    }
                }));
            // Use the value handed to the lambda instead of calling getFullCaption().get() again.
            contentObj.getFullCaption().ifPresent(caption ->
                contentBuilder.append(Jsoup.parse(caption).text()).append("\n"));
        }
    });
    return contentBuilder.toString();
}
Also used : Jdk8Module(com.fasterxml.jackson.datatype.jdk8.Jdk8Module) WashingtonPostCollection(io.anserini.collection.WashingtonPostCollection) IOException(java.io.IOException) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)

Example 2 with WashingtonPostCollection

use of io.anserini.collection.WashingtonPostCollection in project anserini by castorini.

The method extractArticlePlainText of the class BackgroundLinkingTopicReader.

/**
 * Extracts the plain text of a Washington Post article from its raw JSON record.
 *
 * <p>The returned string starts with the article title on its own line. Then, for every non-null
 * content element, it contains the element's content run through {@code Jsoup.parse(...).text()}
 * when the element's type is listed in {@code WashingtonPostCollection.Document.CONTENT_TYPE_TAG},
 * followed by the element's full caption (also run through Jsoup) when one is present. Every
 * appended piece is terminated by a newline.
 *
 * <p>Note that there's code duplication here with the WashingtonPostCollection. We can't just call
 * a method there because the version of the code inside WashingtonPostCollection.Document modifies
 * internal state (e.g., "kicker" and "caption"). Haven't thought of a good solution for this yet.
 *
 * @param record the JSON text of a single article
 * @return the newline-separated plain text assembled from the title, contents and captions
 * @throws RuntimeException wrapping the {@link IOException} raised when the record cannot be
 *     deserialized
 */
private static String extractArticlePlainText(String record) {
    WashingtonPostCollection.Document.WashingtonPostObject wapoObj;
    ObjectMapper mapper = new ObjectMapper();
    try {
        wapoObj = mapper
            // Ignore unrecognized properties
            .disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
            // Deserialize Java 8 Optional: http://www.baeldung.com/jackson-optional
            .registerModule(new Jdk8Module())
            .readValue(record, WashingtonPostCollection.Document.WashingtonPostObject.class);
    } catch (IOException e) {
        // Something is wrong... abort!
        throw new RuntimeException(e);
    }

    StringBuilder contentBuilder = new StringBuilder();
    // NOTE(review): the title is appended as-is; if getTitle() can return null this emits the
    // literal text "null" - confirm against WashingtonPostObject.
    contentBuilder.append(wapoObj.getTitle()).append("\n");

    wapoObj.getContents().ifPresent(contents -> {
        for (WashingtonPostCollection.Document.WashingtonPostObject.Content contentObj : contents) {
            if (contentObj == null) {
                continue;
            }
            // Nested ifPresent already requires both values, so no separate isPresent() guard is needed.
            contentObj.getType().ifPresent(type ->
                contentObj.getContent().ifPresent(content -> {
                    if (WashingtonPostCollection.Document.CONTENT_TYPE_TAG.contains(type)) {
                        contentBuilder.append(Jsoup.parse(content).text()).append("\n");
                    }
                }));
            // Use the value handed to the lambda instead of calling getFullCaption().get() again.
            contentObj.getFullCaption().ifPresent(caption ->
                contentBuilder.append(Jsoup.parse(caption).text()).append("\n"));
        }
    });
    return contentBuilder.toString();
}
Also used : Jdk8Module(com.fasterxml.jackson.datatype.jdk8.Jdk8Module) WashingtonPostCollection(io.anserini.collection.WashingtonPostCollection) IOException(java.io.IOException) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)

Aggregations

ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 2, Jdk8Module (com.fasterxml.jackson.datatype.jdk8.Jdk8Module): 2, WashingtonPostCollection (io.anserini.collection.WashingtonPostCollection): 2, IOException (java.io.IOException): 2