use of edu.stanford.nlp.pipeline.Annotation in project textdb by TextDB.
the class NlpEntityOperator method extractNlpSpans.
/**
 * @param iField
 * @param attributeName
 * @return
 * @about This function takes an IField (TextField) and a String (the field's
 *        name) as input and uses the Stanford NLP package to process the
 *        field based on the input token type and nlpTypeIndicator. In the
 *        resulting spans, the value represents the word itself and the key
 *        represents the recognized token type.
 * @overview First set up a pipeline of Annotators based on the
 *           nlpTypeIndicator. If the nlpTypeIndicator is "NE_ALL", set up
 *           the NamedEntityTagAnnotator; if it is "POS", only the
 *           PartOfSpeechAnnotator is needed.
 *           <p>
 *           The pipeline has to be in this order: TokenizerAnnotator,
 *           SentencesAnnotator, PartOfSpeechAnnotator, LemmaAnnotator and
 *           NamedEntityTagAnnotator.
 *           <p>
 *           In the pipeline, each token is wrapped as a CoreLabel and each
 *           sentence is wrapped as a CoreMap. Each annotator adds its
 *           annotation to the CoreMap (sentence) or CoreLabel (token)
 *           object.
 *           <p>
 *           After the pipeline runs, scan each CoreLabel (token) for its
 *           NamedEntityTagAnnotation or PartOfSpeechAnnotation, depending
 *           on the nlpTypeIndicator.
 *           <p>
 *           For each Stanford NLP annotation, get its corresponding
 *           NlpEntityType used in this package, then check whether it
 *           equals the input token type. If so, make it a span and add it
 *           to the returned list.
 *           <p>
 *           The NLP package has annotations for the start and end
 *           positions of a token, which match the span design exactly, so
 *           we use them directly.
 *           <p>
 *           For example: take the TextField value "Microsoft, Google and
 *           Facebook are organizations while Donald Trump and Barack Obama
 *           are persons", with attributeName "Sentence1" and inputTokenType
 *           Organization. Since the inputTokenType requires the NamedEntity
 *           Annotator in the Stanford NLP package, the nlpTypeIndicator
 *           would be set to "NE_ALL", and the pipeline would be set up to
 *           cover the Named Entity Recognizer. Then get the value of
 *           NamedEntityTagAnnotation for each CoreLabel (token). If the
 *           value is the token type "Organization", it meets the
 *           requirement. In this case "Microsoft", "Google" and "Facebook"
 *           satisfy the requirement, while "Donald Trump" and "Barack
 *           Obama" have token type "Person" and do not. For each qualified
 *           token, create a span accordingly and add it to the returned
 *           list. In this case, the token "Microsoft" would become the
 *           span: ["Sentence1", 0, 9, Organization, "Microsoft"]
 */
private List<Span> extractNlpSpans(IField iField, String attributeName) {
    List<Span> spanList = new ArrayList<>();
    String text = (String) iField.getValue();
    Properties props = new Properties();
    // Set up the Stanford NLP pipeline based on the nlpTypeIndicator.
    // The pipelines are cached in posPipeline / nerPipeline because
    // constructing a StanfordCoreNLP object is expensive.
    StanfordCoreNLP pipeline = null;
    if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) {
        props.setProperty("annotators", "tokenize, ssplit, pos");
        if (posPipeline == null) {
            posPipeline = new StanfordCoreNLP(props);
        }
        pipeline = posPipeline;
    } else {
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        if (nerPipeline == null) {
            nerPipeline = new StanfordCoreNLP(props);
        }
        pipeline = nerPipeline;
    }
    Annotation documentAnnotation = new Annotation(text);
    pipeline.annotate(documentAnnotation);
    List<CoreMap> sentences = documentAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            String stanfordNlpConstant;
            // Extract the annotation based on the nlpTypeIndicator.
            if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) {
                stanfordNlpConstant = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            } else {
                stanfordNlpConstant = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            }
            NlpEntityType nlpEntityType = mapNlpEntityType(stanfordNlpConstant);
            if (nlpEntityType == null) {
                continue;
            }
            if (predicate.getNlpEntityType().equals(NlpEntityType.NE_ALL)
                    || predicate.getNlpEntityType().equals(nlpEntityType)) {
                int start = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                int end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                Span span = new Span(attributeName, start, end, nlpEntityType.toString(), word);
                // Merge adjacent named-entity tokens of the same type
                // (e.g. "Donald" + "Trump") into a single span.
                if (spanList.size() >= 1 && getNlpTypeIndicator(predicate.getNlpEntityType()).equals("NE_ALL")) {
                    Span previousSpan = spanList.get(spanList.size() - 1);
                    if (previousSpan.getAttributeName().equals(span.getAttributeName())
                            && span.getStart() - previousSpan.getEnd() <= 1
                            && previousSpan.getKey().equals(span.getKey())) {
                        span = mergeTwoSpans(previousSpan, span);
                        spanList.remove(spanList.size() - 1);
                    }
                }
                spanList.add(span);
            }
        }
    }
    return spanList;
}
use of edu.stanford.nlp.pipeline.Annotation in project textdb by TextDB.
the class NlpSentimentOperator method computeSentimentScore.
private Integer computeSentimentScore(Tuple inputTuple) {
    String inputText = inputTuple.<IField>getField(predicate.getInputAttributeName()).getValue().toString();
    Annotation documentAnnotation = new Annotation(inputText);
    sentimentPipeline.annotate(documentAnnotation);
    // mainSentiment is the sentiment class of the longest sentence.
    Integer mainSentiment = 0;
    Integer longestSentenceLength = 0;
    for (CoreMap sentence : documentAnnotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
        int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
        String sentenceText = sentence.toString();
        if (sentenceText.length() > longestSentenceLength) {
            mainSentiment = sentiment;
            // Record the new longest sentence; without this update, every
            // sentence would overwrite mainSentiment.
            longestSentenceLength = sentenceText.length();
        }
    }
    return mainSentiment;
}
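The construction of sentimentPipeline is not shown in this snippet. A minimal sketch of how such a pipeline is typically configured in CoreNLP (the sentiment annotator depends on the parser, since RNNCoreAnnotations.getPredictedClass reads the sentiment class, 0 = very negative through 4 = very positive, off the sentiment-annotated parse tree):

Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
StanfordCoreNLP sentimentPipeline = new StanfordCoreNLP(props);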
use of edu.stanford.nlp.pipeline.Annotation in project CoreNLP by stanfordnlp.
the class TSVSentenceProcessor method runAndExit.
/**
 * Runs the given implementation of TSVSentenceProcessor, and then exits with
 * the appropriate error code (that is, the number of exceptions encountered
 * during processing).
 *
 * @param in The input stream to read examples from.
 * @param debugStream The stream to write debugging information to (e.g., stderr).
 * @param cleanup A function to run after annotation is over, to clean up open files, etc.
 *                Takes as input the candidate error code, and returns a new error code to exit on.
 * @param sentenceTableSpec The header of the sentence table fields being fed as input to this function.
 *                          By default, this can be {@link TSVSentenceProcessor#DEFAULT_SENTENCE_TABLE}.
 */
default void runAndExit(InputStream in, PrintStream debugStream, Function<Integer, Integer> cleanup, List<SentenceField> sentenceTableSpec) {
    int exceptions = 0;
    try {
        BufferedReader stdin = new BufferedReader(new InputStreamReader(in));
        int linesProcessed = 0;
        long startTime = System.currentTimeMillis();
        for (String line; (line = stdin.readLine()) != null; ) {
            long id = -1;
            try {
                // Parse the line
                String[] fields = line.split("\t");
                id = Long.parseLong(fields[0]);
                // Create the Annotation
                Annotation doc = TSVUtils.parseSentence(
                        Optional.of(fields[sentenceTableSpec.indexOf(SentenceField.DOC_ID)]),
                        Optional.of(fields[sentenceTableSpec.indexOf(SentenceField.SENTENCE_INDEX)]),
                        fields[sentenceTableSpec.indexOf(SentenceField.GLOSS)],
                        fields[sentenceTableSpec.indexOf(SentenceField.DEPENDENCIES_STANFORD)],
                        fields[sentenceTableSpec.indexOf(SentenceField.DEPENDENCIES_MALT)],
                        fields[sentenceTableSpec.indexOf(SentenceField.WORDS)],
                        fields[sentenceTableSpec.indexOf(SentenceField.LEMMAS)],
                        fields[sentenceTableSpec.indexOf(SentenceField.POS_TAGS)],
                        fields[sentenceTableSpec.indexOf(SentenceField.NER_TAGS)],
                        Optional.of(fields[sentenceTableSpec.indexOf(SentenceField.ID)]));
                // Process the document
                process(id, doc);
                // Debug output
                linesProcessed += 1;
                if (linesProcessed % 1000 == 0) {
                    long currTime = System.currentTimeMillis();
                    // Guard against division by zero when the first 1000
                    // lines are processed in under a second.
                    long sentPerSec = linesProcessed / Math.max(1, (currTime - startTime) / 1000);
                    debugStream.println("[" + Redwood.formatTimeDifference(currTime - startTime) + "] Processed " + linesProcessed + " sentences {" + sentPerSec + " sentences / second}... ");
                }
            } catch (Exception t) {
                debugStream.println("CAUGHT EXCEPTION ON SENTENCE ID: " + id + " (-1 if not known)");
                t.printStackTrace(debugStream);
                exceptions += 1;
            }
        }
        // DONE
        debugStream.println("[" + Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + "] DONE");
    } catch (Exception t) {
        debugStream.println("FATAL EXCEPTION!");
        t.printStackTrace(debugStream);
        exceptions += 1;
    } finally {
        debugStream.flush();
        debugStream.close();
    }
    System.exit(cleanup.apply(exceptions));
}
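For context, a minimal sketch of an implementation of this interface; the class name SentenceTokenCounter is an illustrative assumption, while process(long, Annotation) and DEFAULT_SENTENCE_TABLE come from the interface as shown above:

import java.io.InputStream;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.naturalli.TSVSentenceProcessor;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;

public class SentenceTokenCounter implements TSVSentenceProcessor {
    @Override
    public void process(long id, Annotation doc) {
        // Each Annotation produced by runAndExit holds one reconstructed sentence.
        for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println(id + "\t" + sentence.get(CoreAnnotations.TokensAnnotation.class).size());
        }
    }

    public static void main(String[] args) {
        new SentenceTokenCounter().runAndExit(System.in, System.err,
                errorCode -> errorCode, // no files to clean up; pass the error code through
                TSVSentenceProcessor.DEFAULT_SENTENCE_TABLE);
    }
}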
use of edu.stanford.nlp.pipeline.Annotation in project neo4j-nlp-stanfordnlp by graphaware.
the class StanfordTextProcessor method annotateTag.
@Override
public Tag annotateTag(String text, String lang, PipelineSpecification pipelineSpecification) {
    Annotation document = new Annotation(text);
    pipelines.get(pipelineSpecification.getName()).annotate(document);
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    Optional<CoreMap> sentence = sentences.stream().findFirst();
    if (sentence.isPresent()) {
        List<CoreLabel> tokens = sentence.get().get(CoreAnnotations.TokensAnnotation.class);
        if (tokens != null) {
            if (tokens.size() == 1) {
                // Single token: build the Tag from the token itself, keeping
                // it only if its lemma is valid.
                Optional<Tag> oTag = tokens.stream()
                        .map((token) -> getTag(lang, token))
                        .filter((tag) -> (tag != null) && checkLemmaIsValid(tag.getLemma()))
                        .findFirst();
                if (oTag.isPresent()) {
                    return oTag.get();
                }
            } else if (tokens.size() > 1) {
                // Multi-token text: return a Tag for the whole string with
                // empty part-of-speech and named-entity lists.
                Tag tag = new Tag(text, lang);
                tag.setPos(Arrays.asList());
                tag.setNe(Arrays.asList());
                LOG.info("POS: " + tag.getPosAsList() + " ne: " + tag.getNeAsList() + " lemma: " + tag.getLemma());
                return tag;
            }
        }
    }
    return null;
}
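A hedged usage sketch of the two paths above; how textProcessor and pipelineSpecification are obtained depends on the surrounding GraphAware configuration and is assumed here:

// One token: the Tag carries lemma/POS/NE from the pipeline.
Tag single = textProcessor.annotateTag("movies", "en", pipelineSpecification);
// Multiple tokens: the Tag wraps the whole string with empty POS/NE lists.
Tag phrase = textProcessor.annotateTag("science fiction", "en", pipelineSpecification);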
use of edu.stanford.nlp.pipeline.Annotation in project neo4j-nlp-stanfordnlp by graphaware.
the class DependencyParserTest method testStanfordTypedDependenciesParsing.
@Test
public void testStanfordTypedDependenciesParsing() {
    StanfordCoreNLP pipeline = ((StanfordTextProcessor) textProcessor).getPipeline("default");
    String text = "Show me Josh Wedhon latest movies";
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    CoreMap sentence = sentences.get(0);
    System.out.println(sentence.toString());
    SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
    System.out.println(graph);
    List<SemanticGraphEdge> edges = graph.edgeListSorted();
    for (SemanticGraphEdge edge : edges) {
        System.out.println(edge.getRelation().getSpecific());
        System.out.println(edge.getRelation().getShortName());
        System.out.println(String.format("Source is : %s - Target is : %s - Relation is : %s", edge.getSource(), edge.getTarget(), edge.getRelation()));
    }
}