Use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.
The class CorenlpPipeline, method processPosClassifier.
/**
 * Part-of-Speech classification (maximum entropy) only.
 *
 * @param input    the string to annotate
 * @param hash     the input hash code
 * @param language the input language
 * @return the annotations holding the POS tags
 */
private Annotations processPosClassifier(String input, String hash, Language language) throws InterruptedException {
    Annotations annotations = new Annotations(hash, getType(), language);
    LOGGER.info("POS-tagging for " + language);
    // Split the input into sentences
    final CoreNlpAnnotator<MaxentTagger> nlpAnnotator;
    nlpAnnotator = CoreNlpPosModels.getInstance().get(language);
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(input));
    for (List<HasWord> sentence : sentences) {
        // Tag the sentence with parts of speech
        List<TaggedWord> taggedSentence = nlpAnnotator.annotator.tagSentence(sentence);
        // Feed the annotations
        for (TaggedWord word : taggedSentence) {
            int begin = word.beginPosition();
            int end = word.endPosition();
            String pos = word.tag(); // as in processPipeline, the POS tag value is not used yet
            annotations.add(POS, begin, end);
        }
    }
    return annotations;
}
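For context, here is a minimal standalone sketch of the same Stanford CoreNLP calls used above (MaxentTagger.tokenizeText and tagSentence), without the datashare wrappers. The model path is an assumption and must match whichever POS tagger model is on your classpath.

// Minimal sketch: tokenize and POS-tag a string with MaxentTagger alone.
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import java.io.StringReader;
import java.util.List;

public class PosTagSketch {
    public static void main(String[] args) {
        // Example model path; adjust to the tagger model shipped with your CoreNLP version
        MaxentTagger tagger = new MaxentTagger(
                "edu/stanford/nlp/models/pos-tagger/english-left3words-distsim.tagger");
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader("Datashare was built in Paris."));
        for (List<HasWord> sentence : sentences) {
            for (TaggedWord word : tagger.tagSentence(sentence)) {
                // beginPosition()/endPosition() are the character offsets fed to Annotations.add above
                System.out.printf("%s/%s [%d,%d)%n", word.word(), word.tag(),
                        word.beginPosition(), word.endPosition());
            }
        }
    }
}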
Use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.
The class CorenlpPipeline, method processPipeline.
/**
 * Process with the entire CoreNLP pipeline: sentence splitting, tokenization, POS tagging and NER.
 *
 * @param input    the string to annotate
 * @param hash     the input hash code
 * @param language the input language
 * @return the annotations holding sentences, tokens, POS tags and named entities
 */
private Annotations processPipeline(String input, String hash, Language language) throws InterruptedException {
    Annotations annotations = new Annotations(hash, getType(), language);
    // CoreNLP annotation data structure
    edu.stanford.nlp.pipeline.Annotation coreNlpAnnotation = new edu.stanford.nlp.pipeline.Annotation(input);
    LOGGER.info("sentencing ~ tokenizing ~ POS-tagging ~ name-finding for " + language);
    // Sentence-split, tokenize, POS-tag and NER-tag the input in one pass
    CoreNlpPipelineModels.getInstance().get(language).annotate(coreNlpAnnotation);
    // Feed the annotations
    List<CoreMap> sentences = coreNlpAnnotation.get(SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        int sentenceBegin = sentence.get(CharacterOffsetBeginAnnotation.class);
        int sentenceEnd = sentence.get(CharacterOffsetEndAnnotation.class);
        annotations.add(SENTENCE, sentenceBegin, sentenceEnd);
        int nerBegin = 0;
        int lastTokenEnd = sentenceBegin;
        NamedEntity.Category prevCat = NamedEntity.Category.NONE;
        List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            int tokenBegin = token.get(CharacterOffsetBeginAnnotation.class);
            int tokenEnd = token.get(CharacterOffsetEndAnnotation.class);
            String pos = token.get(PartOfSpeechAnnotation.class); // for now we don't use POS tagging
            annotations.add(TOKEN, tokenBegin, tokenEnd);
            annotations.add(POS, tokenBegin, tokenEnd);
            String cat = token.get(NamedEntityTagAnnotation.class);
            NamedEntity.Category currCat = NamedEntity.Category.parse(cat);
            // Merge runs of tokens sharing the same category into a single NER annotation
            if (prevCat != currCat) {
                if (prevCat != NamedEntity.Category.NONE) {
                    // close the entity that just ended
                    annotations.add(NER, nerBegin, tokenBegin, prevCat);
                }
                if (currCat != NamedEntity.Category.NONE) {
                    // a new entity starts at this token
                    nerBegin = tokenBegin;
                }
            }
            prevCat = currCat;
            lastTokenEnd = tokenEnd;
        }
        // close an entity that runs to the end of the sentence
        if (prevCat != NamedEntity.Category.NONE) {
            annotations.add(NER, nerBegin, lastTokenEnd, prevCat);
        }
    }
    return annotations;
}
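For reference, a minimal sketch of the plain CoreNLP calls that CoreNlpPipelineModels presumably wraps. The annotator list and example sentence are assumptions, not datashare's actual configuration; the offset and tag lookups are the same ones used in processPipeline above.

// Minimal sketch: build a StanfordCoreNLP pipeline and read per-token annotations.
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.Properties;

public class PipelineSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation = new Annotation("Emmanuel Macron lives in Paris.");
        pipeline.annotate(annotation);
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                // word, POS tag, NER tag and character offsets for each token
                System.out.printf("%s %s %s [%d,%d)%n",
                        token.word(),
                        token.get(CoreAnnotations.PartOfSpeechAnnotation.class),
                        token.get(CoreAnnotations.NamedEntityTagAnnotation.class),
                        token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class),
                        token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
            }
        }
    }
}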
Use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.
The class CorenlpPipeline, method processNerClassifier.
/**
 * Named-entity classification (conditional random fields) only.
 *
 * @param doc           the document
 * @param contentLength the length of the chunk to process
 * @param contentOffset the offset of the chunk within the document content
 * @return the list of named entities found in the chunk
 */
private List<NamedEntity> processNerClassifier(Document doc, int contentLength, int contentOffset) throws InterruptedException {
    Annotations annotations = new Annotations(doc.getId(), doc.getRootDocument(), getType(), doc.getLanguage());
    NamedEntitiesBuilder namedEntitiesBuilder = new NamedEntitiesBuilder(getType(), doc.getId(), doc.getLanguage()).withRoot(doc.getRootDocument());
    LOGGER.info("name-finding for {} in document {} (offset {})", doc.getLanguage(), doc.getId(), contentOffset);
    // Recognize named entities in the input chunk
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> abstractSequenceClassifierCoreNlpAnnotator;
    abstractSequenceClassifierCoreNlpAnnotator = CoreNlpNerModels.getInstance().get(doc.getLanguage());
    String chunk = doc.getContent().substring(contentOffset, Math.min(contentOffset + contentLength, doc.getContentTextLength()));
    List<Triple<String, Integer, Integer>> items = abstractSequenceClassifierCoreNlpAnnotator.annotator.classifyToCharacterOffsets(chunk);
    // For each recognized named entity, a Triple of <category, begin, end>
    for (Triple<String, Integer, Integer> item : items) {
        NamedEntity.Category category = NamedEntity.Category.parse(item.first());
        int begin = item.second();
        int end = item.third();
        String mention = ThrowingFunctions.removeNewLines.apply(chunk.substring(begin, end));
        namedEntitiesBuilder.add(category, mention, begin + contentOffset);
    }
    return namedEntitiesBuilder.build();
}
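A minimal sketch of the underlying CRF call follows. The model path is an example of a standard CoreNLP English NER model, not necessarily the one datashare's CoreNlpNerModels loads; classifyToCharacterOffsets returns the same Triples of <category, begin offset, end offset> consumed in processNerClassifier above.

// Minimal sketch: run a CRF named-entity classifier over a string.
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.Triple;
import java.util.List;

public class NerSketch {
    public static void main(String[] args) throws Exception {
        CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifier(
                "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz");
        String chunk = "Emmanuel Macron lives in Paris.";
        List<Triple<String, Integer, Integer>> items = classifier.classifyToCharacterOffsets(chunk);
        for (Triple<String, Integer, Integer> item : items) {
            // category, character offsets, and the mention recovered from those offsets
            System.out.printf("%s [%d,%d) %s%n", item.first(), item.second(), item.third(),
                    chunk.substring(item.second(), item.third()));
        }
    }
}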
Use of org.icij.datashare.text.nlp.Annotations in project datashare by ICIJ.
The class NerResourceTest, method test_post_text_returns_NamedEntity_list.
@Test
public void test_post_text_returns_NamedEntity_list() throws Exception {
    Document doc = DocumentBuilder.createDoc("inline").with("This the 'foù' file content.").with(ENGLISH).build();
    final Annotations annotations = new Annotations("inline", CORENLP, ENGLISH);
    annotations.add(NlpStage.NER, 10, 13, NamedEntity.Category.PERSON);
    doReturn(asList(NamedEntity.create(NamedEntity.Category.PERSON, "foù", asList(10L), doc.getId(), "root", CORENLP, ENGLISH))).when(pipeline).process(eq(doc));
    Response response = post("/api/ner/findNames/CORENLP", doc.getContent()).response();
    List actualNerList = TypeConvert.fromJson(response.content(), List.class);
    assertThat(actualNerList).hasSize(1);
    assertThat(actualNerList.get(0)).isInstanceOf(HashMap.class);
    assertThat((Map) actualNerList.get(0)).includes(entry("mention", "foù"), entry("extractor", "CORENLP"), entry("mentionNorm", "fou"), entry("offsets", asList(10)));
}
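Outside the test harness, the same endpoint could be exercised from a plain Java 11+ client, as in the sketch below. The base URL is an assumption for a locally running datashare instance; the response body is the JSON array of named entities asserted in the test above.

// Minimal sketch: POST text to the NER endpoint with java.net.http.HttpClient.
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class NerClientSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:8080/api/ner/findNames/CORENLP"))
                .POST(HttpRequest.BodyPublishers.ofString("This the 'foù' file content."))
                .build();
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode() + " " + response.body());
    }
}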