use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class NlpEntityOperator method extractNlpSpans.
/**
 * Runs the Stanford NLP pipeline over one text field and returns the spans of
 * every token whose recognized type matches the predicate's entity type.
 *
 * @param iField        the field (a TextField) whose string value is analyzed
 * @param attributeName the name of the field, stored in each produced span
 * @return a list of spans; each span's key is the recognized token type and its
 *         value is the matched word (adjacent same-type named entities are merged)
 * @about This function takes an IField(TextField) and a String (the field's
 *        name) as input and uses the Stanford NLP package to process the
 *        field based on the input token type and nlpTypeIndicator. In the
 *        result spans, value represents the word itself and key represents
 *        the recognized token type.
 * @overview First set up a pipeline of Annotators based on the
 *           nlpTypeIndicator. If the nlpTypeIndicator is "POS", only the
 *           PartOfSpeechAnnotator is needed; otherwise the full
 *           named-entity chain is used.
 *           <p>
 *           The pipeline has to be in this order: TokenizerAnnotator,
 *           SentencesAnnotator, PartOfSpeechAnnotator, LemmaAnnotator and
 *           NamedEntityTagAnnotator.
 *           <p>
 *           In the pipeline, each token is wrapped as a CoreLabel and each
 *           sentence is wrapped as a CoreMap. Each annotator adds its
 *           annotation to the CoreMap(sentence) or CoreLabel(token) object.
 *           <p>
 *           After the pipeline, scan each CoreLabel(token) for its
 *           NamedEntityAnnotation or PartOfSpeechAnnotation depending on the
 *           nlpTypeIndicator.
 *           <p>
 *           For each Stanford NLP annotation, get its corresponding
 *           NlpEntityType used in this package, then check whether it
 *           equals the input token type. If yes, make it a span and add it
 *           to the return list.
 *           <p>
 *           The NLP package has annotations for the start and end position
 *           of a token and it perfectly matches the span design, so we just
 *           use them.
 *           <p>
 *           For example: with TextField value "Microsoft, Google and
 *           Facebook are organizations while Donald Trump and Barack Obama
 *           are persons", attributeName "Sentence1" and inputTokenType
 *           Organization, the nlpTypeIndicator is "NE", so the pipeline
 *           covers the Named Entity Recognizer. "Microsoft", "Google" and
 *           "Facebook" carry the NamedEntityTagAnnotation "Organization"
 *           and satisfy the requirement; "Donald Trump" and "Barack Obama"
 *           have token type "Person" and do not. Token "Microsoft" becomes
 *           span: ["Sentence1", 0, 9, Organization, "Microsoft"].
 */
private List<Span> extractNlpSpans(IField iField, String attributeName) {
    List<Span> spanList = new ArrayList<>();
    String text = (String) iField.getValue();
    // Hoisted loop invariants: the entity type and its indicator depend only on
    // the predicate, so compute them once instead of once per token.
    NlpEntityType targetEntityType = predicate.getNlpEntityType();
    String nlpTypeIndicator = getNlpTypeIndicator(targetEntityType);
    boolean usePosAnnotator = nlpTypeIndicator.equals("POS");
    // Set up (and lazily cache) the Stanford NLP pipeline based on nlpTypeIndicator.
    // Properties are only built when a pipeline actually needs to be created.
    StanfordCoreNLP pipeline;
    if (usePosAnnotator) {
        if (posPipeline == null) {
            Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit, pos");
            posPipeline = new StanfordCoreNLP(props);
        }
        pipeline = posPipeline;
    } else {
        if (nerPipeline == null) {
            Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
            nerPipeline = new StanfordCoreNLP(props);
        }
        pipeline = nerPipeline;
    }
    Annotation documentAnnotation = new Annotation(text);
    pipeline.annotate(documentAnnotation);
    List<CoreMap> sentences = documentAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
    // Adjacent same-type spans are merged only in NE_ALL mode (see below).
    boolean mergeAdjacentSpans = nlpTypeIndicator.equals("NE_ALL");
    for (CoreMap sentence : sentences) {
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            // Extract the annotation matching the nlpTypeIndicator.
            String stanfordNlpConstant = usePosAnnotator
                    ? token.get(CoreAnnotations.PartOfSpeechAnnotation.class)
                    : token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
            NlpEntityType nlpEntityType = mapNlpEntityType(stanfordNlpConstant);
            if (nlpEntityType == null) {
                // Annotation has no corresponding entity type in this package.
                continue;
            }
            if (targetEntityType.equals(NlpEntityType.NE_ALL) || targetEntityType.equals(nlpEntityType)) {
                // Stanford NLP character offsets map directly onto span positions.
                int start = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                int end = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                Span span = new Span(attributeName, start, end, nlpEntityType.toString(), word);
                if (spanList.size() >= 1 && mergeAdjacentSpans) {
                    // Merge with the previous span when the two are adjacent (gap <= 1
                    // character, i.e. a single space) and share attribute and type, so
                    // that multi-word entities like "Donald Trump" form one span.
                    Span previousSpan = spanList.get(spanList.size() - 1);
                    if (previousSpan.getAttributeName().equals(span.getAttributeName())
                            && (span.getStart() - previousSpan.getEnd() <= 1)
                            && previousSpan.getKey().equals(span.getKey())) {
                        span = mergeTwoSpans(previousSpan, span);
                        spanList.remove(spanList.size() - 1);
                    }
                }
                spanList.add(span);
            }
        }
    }
    return spanList;
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class FuzzyTokenMatcher method processOneInputTuple.
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws TextDBException {
    // Collect the payload spans that correspond to query tokens.
    ListField<Span> payloadField = inputTuple.getField(SchemaConstants.PAYLOAD);
    List<Span> payload = payloadField.getValue();
    List<Span> relevantSpans = filterRelevantSpans(payload);
    List<Span> matchResults = new ArrayList<>();
    /*
     * The source operator returns spans even for those fields which did not
     * satisfy the threshold criterion. So if two attributes A,B have 10 and
     * 5 matching tokens, and we set threshold to 10, the number of spans
     * returned is 15. So we need to filter those 5 spans for attribute B.
     */
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getAttributeType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.TEXT && attributeType != AttributeType.STRING) {
            throw new DataFlowException("FuzzyTokenMatcher: Fields other than TEXT or STRING are not supported");
        }
        // relevantSpans is already restricted to query tokens by
        // filterRelevantSpans, so only the attribute filter is needed here.
        List<Span> fieldSpans = relevantSpans.stream()
                .filter(span -> span.getAttributeName().equals(attributeName))
                .collect(Collectors.toList());
        // Keep this field's spans only if it meets the token-count threshold.
        if (fieldSpans.size() >= predicate.getThreshold()) {
            matchResults.addAll(fieldSpans);
        }
    }
    // No field satisfied the threshold: drop the tuple.
    if (matchResults.isEmpty()) {
        return null;
    }
    // Append the matches to the tuple's span list and pass the tuple through.
    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    List<Span> spanList = spanListField.getValue();
    spanList.addAll(matchResults);
    return inputTuple;
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class FuzzyTokenMatcher method filterRelevantSpans.
/**
 * Keeps only the spans whose key is one of the predicate's query tokens.
 *
 * @param spanList the candidate spans (typically the tuple's payload)
 * @return a new list containing just the spans relevant to the query
 */
private List<Span> filterRelevantSpans(List<Span> spanList) {
    List<Span> matchingSpans = new ArrayList<>();
    for (Span candidate : spanList) {
        if (predicate.getQueryTokens().contains(candidate.getKey())) {
            matchingSpans.add(candidate);
        }
    }
    return matchingSpans;
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class RegexMatcher method javaRegexMatch.
/**
 * Finds every match of the configured regex in the given field value using the
 * java.util.regex engine, producing one span per match.
 *
 * @param fieldValue    the text to search
 * @param attributeName the field name recorded in each produced span
 * @return spans covering each regex match; the span key is the regex itself
 *         and the span value is the matched substring
 */
private List<Span> javaRegexMatch(String fieldValue, String attributeName) {
    List<Span> spans = new ArrayList<>();
    java.util.regex.Matcher matcher = this.javaPattern.matcher(fieldValue);
    while (matcher.find()) {
        int matchStart = matcher.start();
        int matchEnd = matcher.end();
        String matchedText = fieldValue.substring(matchStart, matchEnd);
        spans.add(new Span(attributeName, matchStart, matchEnd, this.predicate.getRegex(), matchedText));
    }
    return spans;
}
use of edu.uci.ics.textdb.api.span.Span in project textdb by TextDB.
the class RegexMatcher method processOneInputTuple.
/**
 * This function returns a list of spans in the given tuple that match the
 * regex. For example, given tuple ("george watson", "graduate student", 23,
 * "(949)888-8888") and regex "g[^\s]*", this function will return
 * [Span(name, 0, 6, "g[^\s]*", "george"), Span(position, 0, 8,
 * "g[^\s]*", "graduate")]
 *
 * @param inputTuple
 *            document in which search is performed
 * @return the input tuple with the matching spans appended to its span list,
 *         or null if no field matched the regex
 * @throws DataFlowException
 *             if an attribute is neither STRING nor TEXT
 */
@Override
public Tuple processOneInputTuple(Tuple inputTuple) throws DataFlowException {
    if (inputTuple == null) {
        return null;
    }
    List<Span> matchingResults = new ArrayList<>();
    for (String attributeName : predicate.getAttributeNames()) {
        AttributeType attributeType = inputSchema.getAttribute(attributeName).getAttributeType();
        // types other than TEXT and STRING: throw Exception for now
        // (checked before reading the field value, which is only needed for matching)
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataFlowException("RegexMatcher: Fields other than STRING and TEXT are not supported yet");
        }
        String fieldValue = inputTuple.getField(attributeName).getValue().toString();
        // Dispatch to the configured regex engine.
        switch(regexEngine) {
            case JavaRegex:
                matchingResults.addAll(javaRegexMatch(fieldValue, attributeName));
                break;
            case RE2J:
                matchingResults.addAll(re2jRegexMatch(fieldValue, attributeName));
                break;
        }
    }
    // No match in any field: drop the tuple.
    if (matchingResults.isEmpty()) {
        return null;
    }
    // Append the matches to the tuple's span list and pass the tuple through.
    ListField<Span> spanListField = inputTuple.getField(predicate.getSpanListName());
    List<Span> spanList = spanListField.getValue();
    spanList.addAll(matchingResults);
    return inputTuple;
}
Aggregations