Use of edu.stanford.nlp.util.CoreMap in project lucida by claritylab.
The class TextProcessor, method parse.
/**
 * Returns the date range obtained by parsing the temporal expressions in the text.
 *
 * @param text String representing the original query
 * @return a two-element array holding the start and end of the range; either element may be null
 */
public String[] parse(String text) {
    // The pipelines may produce the same results, so store results in a set.
    TreeSet<Temporal> has_seen = new TreeSet<Temporal>(new TemporalComparator());
    // Time is comparable, so add temporals of type TIME and DATE into a TreeSet to get
    // the minimum and maximum time, which define the range for event retrieval.
    TreeSet<Time> times = new TreeSet<Time>();
    for (AnnotationPipeline pipeline : pieplines) {
        Annotation annotation = new Annotation(text);
        annotation.set(CoreAnnotations.DocDateAnnotation.class, new SimpleDateFormat("yyyy-MM-dd").format(new Date()));
        pipeline.annotate(annotation);
        List<CoreMap> timexAnnsAll = annotation.get(TimeAnnotations.TimexAnnotations.class);
        for (CoreMap cm : timexAnnsAll) {
            Temporal temporal = cm.get(TimeExpression.Annotation.class).getTemporal();
            if (has_seen.contains(temporal)) {
                continue;
            }
            has_seen.add(temporal);
            if (temporal.getTimexType().name().equals("TIME") || temporal.getTimexType().name().equals("DATE")) {
                if (temporal.getTime() != null) {
                    try {
                        times.add(temporal.getTime());
                    } catch (NullPointerException e) {
                        // Comparing a partially specified Time inside the TreeSet can throw; skip such values.
                    }
                }
            }
        }
    }
    // Get the minimum and maximum time only if there are at least two Time objects in times.
    if (times.size() >= 2) {
        return new String[] { regexNormalize(Collections.min(times).toString(), 0),
                regexNormalize(Collections.max(times).toString(), 1) };
    }
    // Since the range couldn't be defined by times, define the range from has_seen.
    for (Temporal temporal : has_seen) {
        // Due to a bug (?) in CoreNLP, getRange() for "current week" will result in year 2015.
        // Thus, try parsing as a week before calling getRange().
        String[] try_parse_as_week = parseAsWeek(temporal.toString(), text);
        if (try_parse_as_week != null) {
            return try_parse_as_week;
        }
        if (isReadbleTime(temporal.getRange().toString())) {
            List<String> string_list = Arrays.asList(temporal.getRange().toString().split(","));
            String s1 = regexNormalize(string_list.get(0), 0);
            String s2 = regexNormalize(string_list.get(1), 1);
            // If both ends of the range fall on the same day, keep only the endpoint implied by the query wording.
            if (s1.length() >= 10 && s2.length() >= 10 && s1.substring(0, 10).equals(s2.substring(0, 10))) {
                if (text.contains("from") || text.contains("start") || text.contains("begin")) {
                    s2 = null;
                } else if (text.contains("until")) {
                    s1 = null;
                }
            }
            return new String[] { s1, s2 };
        }
    }
    // No temporal expression was found by any pipeline.
    return new String[] { null, null };
}
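The pieplines field referenced above is declared elsewhere in TextProcessor and is not part of this snippet; a minimal sketch of how such a list could be populated, assuming the standard CoreNLP SUTime recipe (TokenizerAnnotator, WordsToSentencesAnnotator, POSTaggerAnnotator, TimeAnnotator), might look like this:

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.pipeline.AnnotationPipeline;
import edu.stanford.nlp.pipeline.POSTaggerAnnotator;
import edu.stanford.nlp.pipeline.TokenizerAnnotator;
import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;
import edu.stanford.nlp.time.TimeAnnotator;

// Hypothetical initialization of the pieplines list used by parse(): one
// AnnotationPipeline wired for SUTime, following the standard CoreNLP recipe.
List<AnnotationPipeline> pieplines = new ArrayList<AnnotationPipeline>();
AnnotationPipeline sutimePipeline = new AnnotationPipeline();
sutimePipeline.addAnnotator(new TokenizerAnnotator(false));
sutimePipeline.addAnnotator(new WordsToSentencesAnnotator(false));
sutimePipeline.addAnnotator(new POSTaggerAnnotator(false));
sutimePipeline.addAnnotator(new TimeAnnotator("sutime", new Properties()));
pieplines.add(sutimePipeline);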
Use of edu.stanford.nlp.util.CoreMap in project textdb by TextDB.
The class NlpEntityOperator, method extractNlpSpans.
/**
 * @param iField
 * @param attributeName
 * @return a list of spans for the tokens that match the input token type
 * @about This function takes an IField (TextField) and a String (the field's
 *        name) as input and uses the Stanford NLP package to process the
 *        field based on the input token type and nlpTypeIndicator. In the
 *        resulting spans, the value is the word itself and the key is the
 *        recognized token type.
 * @overview First set up a pipeline of Annotators based on the
 *           nlpTypeIndicator. If the nlpTypeIndicator is "NE_ALL", the
 *           NamedEntityTagAnnotator is set up; if it is "POS", only the
 *           PartOfSpeechAnnotator is needed.
 *           <p>
 *           The pipeline has to be in this order: TokenizerAnnotator,
 *           SentencesAnnotator, PartOfSpeechAnnotator, LemmaAnnotator and
 *           NamedEntityTagAnnotator.
 *           <p>
 *           In the pipeline, each token is wrapped as a CoreLabel and each
 *           sentence is wrapped as a CoreMap. Each annotator adds its
 *           annotation to the CoreMap (sentence) or CoreLabel (token) object.
 *           <p>
 *           After the pipeline runs, scan each CoreLabel (token) for its
 *           NamedEntityTagAnnotation or PartOfSpeechAnnotation, depending on
 *           the nlpTypeIndicator.
 *           <p>
 *           For each Stanford NLP annotation, get its corresponding
 *           NlpEntityType used in this package, then check whether it equals
 *           the input token type. If so, make it a span and add it to the
 *           returned list.
 *           <p>
 *           The NLP package provides annotations for the start and end
 *           positions of a token, which match the span design exactly, so
 *           they are used directly.
 *           <p>
 *           For example, with TextField value "Microsoft, Google and
 *           Facebook are organizations while Donald Trump and Barack Obama
 *           are persons", attributeName "Sentence1", and inputTokenType
 *           Organization: since the inputTokenType requires the NamedEntity
 *           annotator from the Stanford NLP package, the nlpTypeIndicator is
 *           set to "NE" and the pipeline is set up to cover the Named Entity
 *           Recognizer. The value of NamedEntityTagAnnotation is then read
 *           for each CoreLabel (token). If the value is the token type
 *           "Organization", the token meets the requirement; here
 *           "Microsoft", "Google" and "Facebook" qualify, while "Donald
 *           Trump" and "Barack Obama" have token type "Person" and do not.
 *           For each qualifying token, a span is created and added to the
 *           returned list. The token "Microsoft", for instance, becomes the
 *           span ["Sentence1", 0, 9, Organization, "Microsoft"].
 */
private List<Span> extractNlpSpans(IField iField, String attributeName) {
    List<Span> spanList = new ArrayList<>();
    String text = (String) iField.getValue();
    Properties props = new Properties();
    // Set up the Stanford NLP pipeline based on the nlpTypeIndicator.
    StanfordCoreNLP pipeline = null;
    if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) {
        props.setProperty("annotators", "tokenize, ssplit, pos");
        if (posPipeline == null) {
            posPipeline = new StanfordCoreNLP(props);
        }
        pipeline = posPipeline;
    } else {
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        if (nerPipeline == null) {
            nerPipeline = new StanfordCoreNLP(props);
        }
        pipeline = nerPipeline;
    }
    Annotation documentAnnotation = new Annotation(text);
    pipeline.annotate(documentAnnotation);
    List<CoreMap> sentences = documentAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            String stanfordNlpConstant;
            // Extract annotations based on the nlpTypeIndicator.
            if (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("POS")) {
                stanfordNlpConstant = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            } else {
                stanfordNlpConstant = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            }
            NlpEntityType nlpEntityType = mapNlpEntityType(stanfordNlpConstant);
            if (nlpEntityType == null) {
                continue;
            }
            if (predicate.getNlpEntityType().equals(NlpEntityType.NE_ALL) || predicate.getNlpEntityType().equals(nlpEntityType)) {
                int start = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                int end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                Span span = new Span(attributeName, start, end, nlpEntityType.toString(), word);
                // Merge adjacent tokens of the same entity type (e.g. multi-word names) into one span.
                if (spanList.size() >= 1 && (getNlpTypeIndicator(predicate.getNlpEntityType()).equals("NE_ALL"))) {
                    Span previousSpan = spanList.get(spanList.size() - 1);
                    if (previousSpan.getAttributeName().equals(span.getAttributeName())
                            && (span.getStart() - previousSpan.getEnd() <= 1)
                            && previousSpan.getKey().equals(span.getKey())) {
                        Span newSpan = mergeTwoSpans(previousSpan, span);
                        span = newSpan;
                        spanList.remove(spanList.size() - 1);
                    }
                }
                spanList.add(span);
            }
        }
    }
    return spanList;
}
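The mergeTwoSpans helper is not shown above; a hedged sketch of what such a helper could look like, assuming Span exposes a getValue() accessor in addition to the getters already used in extractNlpSpans:

// Hypothetical shape of the mergeTwoSpans helper: keep the first span's start
// and the second span's end, and join the two token texts with a space.
// getValue() is assumed here; only the other getters appear in the code above.
private Span mergeTwoSpans(Span previousSpan, Span currentSpan) {
    String mergedWord = previousSpan.getValue() + " " + currentSpan.getValue();
    return new Span(previousSpan.getAttributeName(), previousSpan.getStart(),
            currentSpan.getEnd(), previousSpan.getKey(), mergedWord);
}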
Use of edu.stanford.nlp.util.CoreMap in project textdb by TextDB.
The class NlpSentimentOperator, method computeSentimentScore.
private Integer computeSentimentScore(Tuple inputTuple) {
    String inputText = inputTuple.<IField>getField(predicate.getInputAttributeName()).getValue().toString();
    Annotation documentAnnotation = new Annotation(inputText);
    sentimentPipeline.annotate(documentAnnotation);
    // mainSentiment is the sentiment class of the longest sentence in the document.
    Integer mainSentiment = 0;
    Integer longestSentenceLength = 0;
    for (CoreMap sentence : documentAnnotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
        int sentiment = RNNCoreAnnotations.getPredictedClass(tree);
        String sentenceText = sentence.toString();
        if (sentenceText.length() > longestSentenceLength) {
            mainSentiment = sentiment;
            longestSentenceLength = sentenceText.length();
        }
    }
    return mainSentiment;
}
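The sentimentPipeline field is initialized outside this method; a minimal sketch of the usual CoreNLP sentiment configuration it presumably wraps:

import java.util.Properties;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

// Hypothetical setup for the sentimentPipeline field: the sentiment annotator
// needs the binarized constituency parse, hence "parse" before "sentiment".
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, parse, sentiment");
StanfordCoreNLP sentimentPipeline = new StanfordCoreNLP(props);
// RNNCoreAnnotations.getPredictedClass(tree) then yields a class from
// 0 (very negative) to 4 (very positive) per sentence.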
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class NERServlet, method outputHighlighting.
public void outputHighlighting(PrintWriter out, CRFClassifier classifier, String input) {
    Set<String> labels = classifier.labels();
    String background = classifier.backgroundSymbol();
    List<List<CoreMap>> sentences = classifier.classify(input);
    Map<String, Color> tagToColorMap = NERGUI.makeTagToColorMap(labels, background);
    StringBuilder result = new StringBuilder();
    int lastEndOffset = 0;
    for (List<CoreMap> sentence : sentences) {
        for (CoreMap word : sentence) {
            int beginOffset = word.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            int endOffset = word.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            String answer = word.get(CoreAnnotations.AnswerAnnotation.class);
            // Preserve any text (whitespace, punctuation) between the previous token and this one.
            if (beginOffset > lastEndOffset) {
                result.append(StringEscapeUtils.escapeHtml4(input.substring(lastEndOffset, beginOffset)));
            }
            // Add a color bar for any tagged words
            if (!background.equals(answer)) {
                Color color = tagToColorMap.get(answer);
                result.append("<span style=\"color:#ffffff;background:" + NERGUI.colorToHTML(color) + "\">");
            }
            result.append(StringEscapeUtils.escapeHtml4(input.substring(beginOffset, endOffset)));
            // Turn off the color bar
            if (!background.equals(answer)) {
                result.append("</span>");
            }
            lastEndOffset = endOffset;
        }
    }
    if (lastEndOffset < input.length()) {
        result.append(StringEscapeUtils.escapeHtml4(input.substring(lastEndOffset)));
    }
    result.append("<br><br>");
    result.append("Potential tags:");
    for (String label : tagToColorMap.keySet()) {
        result.append("<br> ");
        Color color = tagToColorMap.get(label);
        result.append("<span style=\"color:#ffffff;background:" + NERGUI.colorToHTML(color) + "\">");
        result.append(StringEscapeUtils.escapeHtml4(label));
        result.append("</span>");
    }
    out.print(result.toString());
}
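A hedged usage sketch, assuming the call happens inside the servlet; the model path is the stock 3-class English NER model and "response" stands in for an HttpServletResponse, neither of which is shown in the original snippet:

import java.io.PrintWriter;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreLabel;

// Hypothetical call site: load a serialized CRF model from the classpath and
// write the highlighted HTML for a sample sentence to the servlet response.
CRFClassifier<CoreLabel> classifier = CRFClassifier.getClassifierNoExceptions(
        "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz");
PrintWriter out = response.getWriter();
outputHighlighting(out, classifier, "Barack Obama visited Stanford University in California.");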
Use of edu.stanford.nlp.util.CoreMap in project CoreNLP by stanfordnlp.
The class TokenSequenceMatcherITest, method _testTokenSequenceFindsWildcard.
public void _testTokenSequenceFindsWildcard() throws IOException {
    CoreMap doc = createDocument("word1 word2");
    // Test sequence with groups
    TokenSequencePattern p = TokenSequencePattern.compile("[]{2}|[]");
    TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
    boolean match = m.find();
    assertTrue(match);
    assertEquals(0, m.groupCount());
    assertEquals("word1 word2", m.group());
    match = m.find();
    assertFalse(match);
    // Reverse order
    p = TokenSequencePattern.compile("[]|[]{2}");
    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
    match = m.find();
    assertTrue(match);
    assertEquals(0, m.groupCount());
    assertEquals("word1 word2", m.group());
    match = m.find();
    assertFalse(match);
    // Using {1,2}
    p = TokenSequencePattern.compile("[]{2}");
    m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class));
    match = m.find();
    assertTrue(match);
    assertEquals(0, m.groupCount());
    assertEquals("word1 word2", m.group());
    match = m.find();
    assertFalse(match);
}
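createDocument is a helper of the test class and is not shown here; a minimal stand-in, assuming a plain tokenize/ssplit pipeline is enough to populate TokensAnnotation for the matcher:

import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

// Hypothetical stand-in for the test's createDocument helper: run tokenize/ssplit
// so that TokensAnnotation is populated for TokenSequencePattern matching.
private static CoreMap createDocument(String text) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation(text);
    pipeline.annotate(doc);
    return doc;
}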