Search in sources :

Example 1 with Annotations

use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.

In the class CorenlpPipeline, the method processPosClassifier:

/**
 * Part-of-Speech classification (maximum entropy) only.
 *
 * @param input    the text to annotate
 * @param hash     the input hash code
 * @param language the input language
 * @return the POS annotations found in the input
 * @throws InterruptedException if model loading is interrupted
 */
private Annotations processPosClassifier(String input, String hash, Language language) throws InterruptedException {
    Annotations annotations = new Annotations(hash, getType(), language);
    LOGGER.info("POS-tagging for " + language.toString());
    // Split input into sentences
    final CoreNlpAnnotator<MaxentTagger> nlpAnnotator;
    nlpAnnotator = CoreNlpPosModels.getInstance().get(language);
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(input));
    for (List<HasWord> sentence : sentences) {
        // Tag the sentence with parts-of-speech
        List<TaggedWord> taggedSentence = nlpAnnotator.annotator.tagSentence(sentence);
        // Feed annotations: only character offsets are recorded; the POS tag
        // itself (word.tag()) is intentionally not stored — same convention
        // as in processPipeline.
        for (TaggedWord word : taggedSentence) {
            annotations.add(POS, word.beginPosition(), word.endPosition());
        }
    }
    return annotations;
}
Also used : HasWord(edu.stanford.nlp.ling.HasWord) Annotations(org.icij.datashare.text.nlp.Annotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) TaggedWord(edu.stanford.nlp.ling.TaggedWord) MaxentTagger(edu.stanford.nlp.tagger.maxent.MaxentTagger) StringReader(java.io.StringReader) Collections.singletonList(java.util.Collections.singletonList)

Example 2 with Annotations

use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.

In the class CorenlpPipeline, the method processPipeline:

/**
 * Process the input with the entire CoreNLP pipeline
 * (sentence splitting, tokenizing, POS-tagging, NER).
 *
 * @param input    the text to annotate
 * @param hash     the input hash code
 * @param language the input language
 * @return the sentence, token, POS and named-entity annotations found in the input
 * @throws InterruptedException if model loading is interrupted
 */
private Annotations processPipeline(String input, String hash, Language language) throws InterruptedException {
    Annotations annotations = new Annotations(hash, getType(), language);
    // CoreNLP annotations data-structure
    edu.stanford.nlp.pipeline.Annotation coreNlpAnnotation = new edu.stanford.nlp.pipeline.Annotation(input);
    LOGGER.info("sentencing ~ tokenizing ~ POS-tagging ~ name-finding for " + language.toString());
    // Run the full pipeline: sentencize, tokenize, POS-tag, NER
    CoreNlpPipelineModels.getInstance().get(language).annotate(coreNlpAnnotation);
    // Feed annotations
    List<CoreMap> sentences = coreNlpAnnotation.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        int sentenceBegin = sentence.get(CharacterOffsetBeginAnnotation.class);
        int sentenceEnd = sentence.get(CharacterOffsetEndAnnotation.class);
        annotations.add(SENTENCE, sentenceBegin, sentenceEnd);
        // Named entities are emitted as maximal runs of tokens sharing the
        // same non-NONE category; nerBegin tracks the start of the open run.
        int nerBegin = 0;
        NamedEntity.Category prevCat = NamedEntity.Category.NONE;
        List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            int tokenBegin = token.get(CharacterOffsetBeginAnnotation.class);
            int tokenEnd = token.get(CharacterOffsetEndAnnotation.class);
            // the POS tag value (PartOfSpeechAnnotation) is intentionally not
            // stored, only the token offsets
            annotations.add(TOKEN, tokenBegin, tokenEnd);
            annotations.add(POS, tokenBegin, tokenEnd);
            String cat = token.get(NamedEntityTagAnnotation.class);
            NamedEntity.Category currCat = NamedEntity.Category.parse(cat);
            if (currCat != NamedEntity.Category.NONE) {
                if (prevCat != currCat) {
                    // close a previous entity of a different category that
                    // ends right where this one starts (fix: it used to be
                    // silently dropped)
                    if (prevCat != NamedEntity.Category.NONE) {
                        annotations.add(NER, nerBegin, tokenBegin, prevCat);
                    }
                    nerBegin = tokenBegin;
                }
            } else {
                if (prevCat != currCat) {
                    annotations.add(NER, nerBegin, tokenBegin, prevCat);
                }
            }
            prevCat = currCat;
        }
        // flush an entity that runs up to the end of the sentence (fix: the
        // loop above only closed entities when a following token ended them)
        if (prevCat != NamedEntity.Category.NONE) {
            annotations.add(NER, nerBegin, sentenceEnd, prevCat);
        }
    }
    return annotations;
}
Also used : NamedEntity(org.icij.datashare.text.NamedEntity) CoreLabel(edu.stanford.nlp.ling.CoreLabel) Annotations(org.icij.datashare.text.nlp.Annotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) CoreMap(edu.stanford.nlp.util.CoreMap)

Example 3 with Annotations

use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.

In the class CorenlpPipeline, the method processNerClassifier:

/**
 * Named Entity Classification (Conditional Random Fields) only.
 *
 * @param doc           the document to process
 * @param contentLength the number of characters of content to process
 * @param contentOffset the offset in the document content where processing starts
 * @return the named entities recognized in the selected chunk of the document
 * @throws InterruptedException if model loading is interrupted
 */
private List<NamedEntity> processNerClassifier(Document doc, int contentLength, int contentOffset) throws InterruptedException {
    NamedEntitiesBuilder namedEntitiesBuilder = new NamedEntitiesBuilder(getType(), doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    LOGGER.info("name-finding for {} in document {} (offset {})", doc.getLanguage(), doc.getId(), contentOffset);
    // Recognize named entities from input
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> abstractSequenceClassifierCoreNlpAnnotator;
    abstractSequenceClassifierCoreNlpAnnotator = CoreNlpNerModels.getInstance().get(doc.getLanguage());
    // Clamp the chunk end to the document length so the last chunk is safe
    String chunk = doc.getContent().substring(contentOffset, Math.min(contentOffset + contentLength, doc.getContentTextLength()));
    List<Triple<String, Integer, Integer>> items = abstractSequenceClassifierCoreNlpAnnotator.annotator.classifyToCharacterOffsets(chunk);
    // For each recognized named entity; Triple is <category, begin, end>,
    // with begin/end relative to the chunk
    for (Triple<String, Integer, Integer> item : items) {
        NamedEntity.Category category = NamedEntity.Category.parse(item.first());
        int begin = item.second();
        int end = item.third();
        String mention = ThrowingFunctions.removeNewLines.apply(chunk.substring(begin, end));
        // re-base the mention offset onto the whole document
        namedEntitiesBuilder.add(category, mention, begin + contentOffset);
    }
    return namedEntitiesBuilder.build();
}
Also used : Triple(edu.stanford.nlp.util.Triple) AbstractSequenceClassifier(edu.stanford.nlp.ie.AbstractSequenceClassifier) NamedEntity(org.icij.datashare.text.NamedEntity) Annotations(org.icij.datashare.text.nlp.Annotations) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) NamedEntitiesBuilder(org.icij.datashare.text.NamedEntitiesBuilder)

Example 4 with Annotations

use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.

In the class NerResourceTest, the method test_post_text_returns_NamedEntity_list:

@Test
public void test_post_text_returns_NamedEntity_list() throws Exception {
    // Given a document and a pipeline mocked to return one PERSON entity
    // ("foù" at offset 10 in the content below)
    Document doc = DocumentBuilder.createDoc("inline").with("This the 'foù' file content.").with(ENGLISH).build();
    doReturn(asList(NamedEntity.create(NamedEntity.Category.PERSON, "foù", asList(10L), doc.getId(), "root", CORENLP, ENGLISH))).when(pipeline).process(eq(doc));
    // When posting the document content to the NER endpoint
    Response response = post("/api/ner/findNames/CORENLP", doc.getContent()).response();
    // Then the response is a one-element JSON list describing the entity,
    // with the mention normalized (accent stripped) and its offset preserved
    List actualNerList = TypeConvert.fromJson(response.content(), List.class);
    assertThat(actualNerList).hasSize(1);
    assertThat(actualNerList.get(0)).isInstanceOf(HashMap.class);
    assertThat((Map) actualNerList.get(0)).includes(entry("mention", "foù"), entry("extractor", "CORENLP"), entry("mentionNorm", "fou"), entry("offsets", asList(10)));
}
Also used : Response(net.codestory.rest.Response) Annotations(org.icij.datashare.text.nlp.Annotations) Arrays.asList(java.util.Arrays.asList) Collections.emptyList(java.util.Collections.emptyList) List(java.util.List) Document(org.icij.datashare.text.Document) HashMap(java.util.HashMap) Map(java.util.Map) AbstractProdWebServerTest(org.icij.datashare.web.testhelpers.AbstractProdWebServerTest) Test(org.junit.Test)

Aggregations

Annotations (org.icij.datashare.text.nlp.Annotations)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 NamedEntity (org.icij.datashare.text.NamedEntity)2 AbstractSequenceClassifier (edu.stanford.nlp.ie.AbstractSequenceClassifier)1 CoreLabel (edu.stanford.nlp.ling.CoreLabel)1 HasWord (edu.stanford.nlp.ling.HasWord)1 TaggedWord (edu.stanford.nlp.ling.TaggedWord)1 MaxentTagger (edu.stanford.nlp.tagger.maxent.MaxentTagger)1 CoreMap (edu.stanford.nlp.util.CoreMap)1 Triple (edu.stanford.nlp.util.Triple)1 StringReader (java.io.StringReader)1 Arrays.asList (java.util.Arrays.asList)1 Collections.emptyList (java.util.Collections.emptyList)1 Collections.singletonList (java.util.Collections.singletonList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Response (net.codestory.rest.Response)1 Document (org.icij.datashare.text.Document)1 NamedEntitiesBuilder (org.icij.datashare.text.NamedEntitiesBuilder)1