Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.
In the class NumberNormalizer, the method findNumberRanges:
public static List<CoreMap> findNumberRanges(CoreMap annotation) {
  List<CoreMap> numerizedTokens = annotation.get(CoreAnnotations.NumerizedTokensAnnotation.class);
  // First pass: mark tokens whose own text matches the number-range pattern (two numbers with a separator)
  for (CoreMap token : numerizedTokens) {
    String w = token.get(CoreAnnotations.TextAnnotation.class);
    w = w.trim().toLowerCase();
    Matcher rangeMatcher = NumberNormalizer.numRangePattern.matcher(w);
    if (rangeMatcher.matches()) {
      try {
        String w1 = rangeMatcher.group(1);
        String w2 = rangeMatcher.group(2);
        Number v1 = NumberNormalizer.wordToNumber(w1);
        Number v2 = NumberNormalizer.wordToNumber(w2);
        if (v2.doubleValue() > v1.doubleValue()) {
          token.set(CoreAnnotations.NumericTypeAnnotation.class, "NUMBER_RANGE");
          token.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, "NUMBER_RANGE");
          Pair<Number, Number> range = new Pair<>(v1, v2);
          token.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range);
        }
      } catch (Exception ex) {
        logger.warning("Error interpreting number range " + w + ": " + ex.getMessage());
      }
    }
  }
  // Second pass: match the TokensRegex rangePattern over the numerized tokens to find multi-token ranges
  List<CoreMap> numberRanges = new ArrayList<>();
  TokenSequenceMatcher matcher = rangePattern.getMatcher(numerizedTokens);
  while (matcher.find()) {
    List<CoreMap> matched = matcher.groupNodes();
    if (matched.size() == 1) {
      // A single chunk was already marked as a range above
      numberRanges.add(matched.get(0));
    } else {
      Number v1 = matched.get(0).get(CoreAnnotations.NumericCompositeValueAnnotation.class);
      Number v2 = matched.get(matched.size() - 1).get(CoreAnnotations.NumericCompositeValueAnnotation.class);
      if (v2.doubleValue() > v1.doubleValue()) {
        // Merge the matched tokens into one chunk annotated with the (v1, v2) range
        CoreMap newChunk = CoreMapAggregator.getDefaultAggregator().merge(numerizedTokens, matcher.start(), matcher.end());
        newChunk.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, "NUMBER_RANGE");
        Pair<Number, Number> range = new Pair<>(v1, v2);
        newChunk.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range);
        numberRanges.add(newChunk);
      }
    }
  }
  return numberRanges;
}
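A minimal driver for findNumberRanges might look like the sketch below (not from the CoreNLP sources). The sentence and pipeline configuration are illustrative, and it is assumed here that NumberNormalizer.findAndMergeNumbers produces the merged number chunks that findNumberRanges expects to read from NumerizedTokensAnnotation.

import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ie.NumberNormalizer;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class NumberRangeDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("The trip takes three to five hours.");
    pipeline.annotate(doc);
    CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    // Assumption: findAndMergeNumbers yields the number-merged token chunks that
    // findNumberRanges reads from NumerizedTokensAnnotation (see the method above).
    List<CoreMap> numerized = NumberNormalizer.findAndMergeNumbers(sentence);
    sentence.set(CoreAnnotations.NumerizedTokensAnnotation.class, numerized);
    for (CoreMap range : NumberNormalizer.findNumberRanges(sentence)) {
      System.out.println("range: " + range.get(CoreAnnotations.TextAnnotation.class));
    }
  }
}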
Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.
In the class RelationTripleSegmenter, the method extract:
/**
* Extract the nominal patterns from this sentence.
*
* @see RelationTripleSegmenter#NOUN_TOKEN_PATTERNS
* @see RelationTripleSegmenter#NOUN_DEPENDENCY_PATTERNS
*
* @param parse The parse tree of the sentence to annotate.
* @param tokens The tokens of the sentence to annotate.
* @return A list of {@link RelationTriple}s. Note that these do not have an associated tree with them.
*/
@SuppressWarnings("unchecked")
public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens) {
  List<RelationTriple> extractions = new ArrayList<>();
  Set<Triple<Span, String, Span>> alreadyExtracted = new HashSet<>();
  //
  // Run the token patterns
  //
  for (TokenSequencePattern tokenPattern : NOUN_TOKEN_PATTERNS) {
    TokenSequenceMatcher tokenMatcher = tokenPattern.matcher(tokens);
    while (tokenMatcher.find()) {
      boolean missingPrefixBe;
      boolean missingSuffixOf = false;
      // Create subject
      List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
      Span subjectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) subject.get(0)).index() - 1, ((CoreLabel) subject.get(subject.size() - 1)).index()));
      List<CoreLabel> subjectTokens = new ArrayList<>();
      for (int i : subjectSpan) {
        subjectTokens.add(tokens.get(i));
      }
      // Create object
      List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
      Span objectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) object.get(0)).index() - 1, ((CoreLabel) object.get(object.size() - 1)).index()));
      if (Span.overlaps(subjectSpan, objectSpan)) {
        continue;
      }
      List<CoreLabel> objectTokens = new ArrayList<>();
      for (int i : objectSpan) {
        objectTokens.add(tokens.get(i));
      }
      // Create relation
      if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
        List<CoreLabel> relationTokens = new ArrayList<>();
        // (add the 'be')
        missingPrefixBe = true;
        // (add a complement to the 'be')
        List<? extends CoreMap> beofComp = tokenMatcher.groupNodes("$beof_comp");
        if (beofComp != null) {
          // (add the complement)
          for (CoreMap token : beofComp) {
            if (token instanceof CoreLabel) {
              relationTokens.add((CoreLabel) token);
            } else {
              relationTokens.add(new CoreLabel(token));
            }
          }
          // (add the 'of')
          missingSuffixOf = true;
        }
        // Add the extraction
        String relationGloss = StringUtils.join(relationTokens.stream().map(CoreLabel::word), " ");
        if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
          RelationTriple extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens);
          //noinspection ConstantConditions
          extraction.isPrefixBe(missingPrefixBe);
          extraction.isSuffixOf(missingSuffixOf);
          extractions.add(extraction);
          alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
        }
      }
    }
  }
  //
  // Run the dependency (Semgrex) patterns
  //
  for (SemgrexPattern semgrex : NOUN_DEPENDENCY_PATTERNS) {
    SemgrexMatcher matcher = semgrex.matcher(parse);
    while (matcher.find()) {
      boolean missingPrefixBe = false;
      boolean missingSuffixBe = false;
      boolean istmod = false;
      // Get relaux if applicable
      String relaux = matcher.getRelnString("relaux");
      String ignoredArc = relaux;
      if (ignoredArc == null) {
        ignoredArc = matcher.getRelnString("arc");
      }
      // Create subject
      IndexedWord subject = matcher.getNode("subject");
      List<IndexedWord> subjectTokens = new ArrayList<>();
      Span subjectSpan;
      if (subject.ner() != null && !"O".equals(subject.ner())) {
        subjectSpan = Util.extractNER(tokens, Span.fromValues(subject.index() - 1, subject.index()));
        for (int i : subjectSpan) {
          subjectTokens.add(new IndexedWord(tokens.get(i)));
        }
      } else {
        subjectTokens = getValidChunk(parse, subject, VALID_SUBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(subject));
        subjectSpan = Util.tokensToSpan(subjectTokens);
      }
      // Create object
      IndexedWord object = matcher.getNode("object");
      List<IndexedWord> objectTokens = new ArrayList<>();
      Span objectSpan;
      if (object.ner() != null && !"O".equals(object.ner())) {
        objectSpan = Util.extractNER(tokens, Span.fromValues(object.index() - 1, object.index()));
        for (int i : objectSpan) {
          objectTokens.add(new IndexedWord(tokens.get(i)));
        }
      } else {
        objectTokens = getValidChunk(parse, object, VALID_OBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(object));
        objectSpan = Util.tokensToSpan(objectTokens);
      }
      // Check that the pair is valid
      if (Span.overlaps(subjectSpan, objectSpan)) {
        // We extracted an identity
        continue;
      }
      if (subjectSpan.end() == objectSpan.start() - 1 && (tokens.get(subjectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(subjectSpan.end()).tag()))) {
        // We're straddling a clause
        continue;
      }
      if (objectSpan.end() == subjectSpan.start() - 1 && (tokens.get(objectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(objectSpan.end()).tag()))) {
        // We're straddling a clause
        continue;
      }
      // Get any prepositional edges
      String expected = relaux == null ? "" : relaux.substring(relaux.indexOf(":") + 1).replace("_", " ");
      IndexedWord prepWord = null;
      // (these usually come from the object)
      boolean prepositionIsPrefix = false;
      for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(object)) {
        if (edge.getRelation().toString().equals("case")) {
          prepWord = edge.getDependent();
        }
      }
      // (...but sometimes from the subject)
      if (prepWord == null) {
        for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(subject)) {
          if (edge.getRelation().toString().equals("case")) {
            prepositionIsPrefix = true;
            prepWord = edge.getDependent();
          }
        }
      }
      List<IndexedWord> prepChunk = Collections.EMPTY_LIST;
      if (prepWord != null && !expected.equals("tmod")) {
        Optional<List<IndexedWord>> optionalPrepChunk = getValidChunk(parse, prepWord, Collections.singleton("mwe"), Optional.empty(), true);
        if (!optionalPrepChunk.isPresent()) {
          continue;
        }
        prepChunk = optionalPrepChunk.get();
        // ascending sort by position
        Collections.sort(prepChunk, (a, b) -> {
          double val = a.pseudoPosition() - b.pseudoPosition();
          if (val < 0) {
            return -1;
          } else if (val > 0) {
            return 1;
          } else {
            return 0;
          }
        });
      }
      // Get the relation
      if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
        LinkedList<IndexedWord> relationTokens = new LinkedList<>();
        IndexedWord relNode = matcher.getNode("relation");
        if (relNode != null) {
          // Case: we have a grounded relation span
          // (add the relation)
          relationTokens.add(relNode);
          // (add any prepositional case markings)
          if (prepositionIsPrefix) {
            // We're almost certainly missing a suffix 'be'
            missingSuffixBe = true;
            for (int i = prepChunk.size() - 1; i >= 0; --i) {
              relationTokens.addFirst(prepChunk.get(i));
            }
          } else {
            relationTokens.addAll(prepChunk);
          }
          if (expected.equalsIgnoreCase("tmod")) {
            istmod = true;
          }
        } else {
          // (mark it as missing a preceding 'be')
          if (!expected.equals("poss")) {
            missingPrefixBe = true;
          }
          // (add any prepositional case markings)
          if (prepositionIsPrefix) {
            for (int i = prepChunk.size() - 1; i >= 0; --i) {
              relationTokens.addFirst(prepChunk.get(i));
            }
          } else {
            relationTokens.addAll(prepChunk);
          }
          if (expected.equalsIgnoreCase("tmod")) {
            istmod = true;
          }
          // (some fine-tuning)
          if (allowNominalsWithoutNER && "of".equals(expected)) {
            // prohibit things like "conductor of electricity" -> "conductor; be of; electricity"
            continue;
          }
        }
        // Add the extraction
        String relationGloss = StringUtils.join(relationTokens.stream().map(IndexedWord::word), " ");
        if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
          RelationTriple extraction = new RelationTriple(
              subjectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
              relationTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
              objectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()));
          extraction.istmod(istmod);
          extraction.isPrefixBe(missingPrefixBe);
          extraction.isSuffixBe(missingSuffixBe);
          extractions.add(extraction);
          alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
        }
      }
    }
  }
  //
  // Filter downward polarity extractions
  //
  Iterator<RelationTriple> iter = extractions.iterator();
  while (iter.hasNext()) {
    RelationTriple term = iter.next();
    boolean shouldRemove = true;
    for (CoreLabel token : term) {
      if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null || !token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards()) {
        shouldRemove = false;
      }
    }
    if (shouldRemove) {
      // Don't extract things in downward polarity contexts.
      iter.remove();
    }
  }
  // Return
  return extractions;
}
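As a usage sketch (not from the CoreNLP sources), the nominal extractor can be driven end to end from a pipeline that supplies NER and a dependency parse. The sentence, the pipeline configuration, and the choice of the enhanced++ graph are illustrative assumptions; note that without the natlog annotator every token's PolarityAnnotation is null, so the downward-polarity filter above keeps all extractions.

import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.naturalli.RelationTripleSegmenter;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;

public class NominalTripleDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,depparse");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Chris Manning, a professor at Stanford, gave a talk.");
    pipeline.annotate(doc);
    CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    SemanticGraph parse = sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
    // extract() runs both the token patterns and the Semgrex patterns shown above
    for (RelationTriple triple : new RelationTripleSegmenter().extract(parse, tokens)) {
      System.out.println(triple.subjectGloss() + " | " + triple.relationGloss() + " | " + triple.objectGloss());
    }
  }
}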
Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.
In the class NaturalLogicAnnotator, the method annotateUnaries:
/**
* Annotate any unary quantifiers that weren't found in the main {@link NaturalLogicAnnotator#annotateOperators(CoreMap)} method.
* @param sentence The sentence to annotate.
*/
private static void annotateUnaries(CoreMap sentence) {
  // Get the tree and the tokens
  SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  if (tree == null) {
    tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  }
  List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
  // Compute a mask of tokens already covered by an operator
  boolean[] isOperator = new boolean[tokens.size()];
  for (int i = 0; i < isOperator.length; ++i) {
    OperatorSpec spec = tokens.get(i).get(OperatorAnnotation.class);
    if (spec != null) {
      for (int k = spec.quantifierBegin; k < spec.quantifierEnd; ++k) {
        isOperator[k] = true;
      }
    }
  }
  // Match the Semgrex pattern for unary quantifiers
  SemgrexMatcher matcher = UNARY_PATTERN.matcher(tree);
  while (matcher.find()) {
    // Get the relevant nodes
    IndexedWord quantifier = matcher.getNode("quantifier");
    String word = quantifier.word().toLowerCase();
    if (word.equals("a") || word.equals("an") || word.equals("the") || "CD".equals(quantifier.tag())) {
      // These are absurdly common and uninformative, and we would just shoot ourselves in the foot with parsing errors and idiomatic expressions.
      continue;
    }
    IndexedWord subject = matcher.getNode("subject");
    // ... if there is not already an operator there
    if (!isOperator[quantifier.index() - 1]) {
      Optional<Triple<Operator, Integer, Integer>> quantifierInfo = validateQuantifierByHead(sentence, quantifier);
      // ... and if we found a quantifier span
      if (quantifierInfo.isPresent()) {
        // Then add the unary operator!
        OperatorSpec scope = computeScope(tree, quantifierInfo.get().first, subject, Pair.makePair(quantifierInfo.get().second, quantifierInfo.get().third), null, false, null, tokens.size());
        CoreLabel token = tokens.get(quantifier.index() - 1);
        token.set(OperatorAnnotation.class, scope);
      }
    }
  }
  // Match the TokensRegex pattern for "doubt"-style negation triggers
  TokenSequenceMatcher tokenMatcher = DOUBT_PATTERN.matcher(tokens);
  while (tokenMatcher.find()) {
    List<CoreLabel> doubt = (List<CoreLabel>) tokenMatcher.groupNodes("$doubt");
    List<CoreLabel> target = (List<CoreLabel>) tokenMatcher.groupNodes("$target");
    for (CoreLabel word : doubt) {
      OperatorSpec spec = new OperatorSpec(Operator.GENERAL_NEG_POLARITY, word.index() - 1, word.index(), target.get(0).index() - 1, target.get(target.size() - 1).index(), 0, 0, tokens.size());
      word.set(OperatorAnnotation.class, spec);
    }
  }
}
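Since annotateUnaries is private, it is normally exercised through the natlog annotator, which runs NaturalLogicAnnotator as part of a pipeline. A minimal sketch of observing its output follows; the sentence is illustrative, and the spec.instance access assumes OperatorSpec's public field holding the matched Operator.

import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.naturalli.NaturalLogicAnnotations;
import edu.stanford.nlp.naturalli.OperatorSpec;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class UnaryOperatorDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,depparse,natlog");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("I doubt that all cats like water.");
    pipeline.annotate(doc);
    for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        // Tokens marked by annotateOperators/annotateUnaries carry an OperatorSpec
        OperatorSpec spec = token.get(NaturalLogicAnnotations.OperatorAnnotation.class);
        if (spec != null) {
          System.out.println(token.word() + " -> " + spec.instance);
        }
      }
    }
  }
}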
Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.
In the class NumberNormalizer, the method findNumbers:
/**
* Find and mark numbers (does not need NumberSequenceClassifier)
* Each token is annotated with the numeric value and type
* - CoreAnnotations.NumericTypeAnnotation.class: ORDINAL, UNIT (hundred, thousand,..., dozen, gross,...), NUMBER
* - CoreAnnotations.NumericValueAnnotation.class: Number representing the numeric value of the token
* ( two thousand => 2 1000 )
*
* Tries also to separate individual numbers like four five six,
* while keeping numbers like four hundred and seven together
* Annotate tokens belonging to each composite number with
* - CoreAnnotations.NumericCompositeTypeAnnotation.class: ORDINAL (1st, 2nd), NUMBER (one hundred)
* - CoreAnnotations.NumericCompositeValueAnnotation.class: Number representing the composite numeric value
* ( two thousand => 2000 2000 )
*
* Also returns list of CoreMap representing the identified numbers
*
* The function is overly aggressive in marking possible numbers
* - should either do more checks or use in conjunction with NumberSequenceClassifier
* to avoid marking certain tokens (like second/NN) as numbers...
*
* @param annotation The annotation structure
* @return list of CoreMap representing the identified numbers
*/
public static List<CoreMap> findNumbers(CoreMap annotation) {
  List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
  // First pass: annotate each candidate token with its numeric value and type
  for (CoreLabel token : tokens) {
    String w = token.word();
    w = w.trim().toLowerCase();
    if (/*("CD".equals(token.get(CoreAnnotations.PartOfSpeechAnnotation.class)) || */
        NumberNormalizer.numPattern.matcher(w).matches() || NumberNormalizer.numberTermPattern2.matcher(w).matches()
        || NumberSequenceClassifier.ORDINAL_PATTERN.matcher(w).matches() || NumberNormalizer.numEndUnitPattern.matcher(w).matches()) {
      // But maybe we don't care: this can just mark the potential numbers, and something else can disregard them
      try {
        token.set(CoreAnnotations.NumericValueAnnotation.class, NumberNormalizer.wordToNumber(w));
        if (NumberSequenceClassifier.ORDINAL_PATTERN.matcher(w).find()) {
          token.set(CoreAnnotations.NumericTypeAnnotation.class, "ORDINAL");
        } else if (NumberNormalizer.numUnitPattern.matcher(w).matches()) {
          token.set(CoreAnnotations.NumericTypeAnnotation.class, "UNIT");
        } else if (NumberNormalizer.numEndUnitPattern.matcher(w).matches()) {
          token.set(CoreAnnotations.NumericTypeAnnotation.class, "UNIT");
        } else {
          token.set(CoreAnnotations.NumericTypeAnnotation.class, "NUMBER");
        }
      } catch (Exception ex) {
        logger.warning("Error interpreting number " + w + ": " + ex.getMessage());
      }
    }
  }
  // TODO: Should we allow "," in written-out numbers?
  // TODO: Handle "-" that is not with the token?
  TokenSequenceMatcher matcher = numberPattern.getMatcher(tokens);
  List<CoreMap> numbers = new ArrayList<>();
  while (matcher.find()) {
    @SuppressWarnings("unused")
    List<CoreMap> matchedTokens = matcher.groupNodes();
    int numStart = matcher.start();
    int possibleNumEnd = -1;
    int lastUnitPos = -1;
    int possibleNumStart = -1;
    Number possibleNumEndUnit = null;
    Number lastUnit = null;
    // Check whether we need to split the matched chunk up further
    for (int i = matcher.start(); i < matcher.end(); i++) {
      CoreLabel token = tokens.get(i);
      CoreLabel prev = (i > matcher.start()) ? tokens.get(i - 1) : null;
      Number num = token.get(CoreAnnotations.NumericValueAnnotation.class);
      Number prevNum = (prev != null) ? prev.get(CoreAnnotations.NumericValueAnnotation.class) : null;
      String w = token.word();
      w = w.trim().toLowerCase();
      switch (w) {
        case ",":
          if (lastUnit != null && lastUnitPos == i - 1) {
            // OKAY, this may be one big number
            possibleNumEnd = i;
            possibleNumEndUnit = lastUnit;
          } else {
            // Not one big number
            if (numStart < i) {
              numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
              numStart = i + 1;
              possibleNumEnd = -1;
              possibleNumEndUnit = null;
              lastUnit = null;
              lastUnitPos = -1;
            }
          }
          if (numStart == i) {
            numStart = i + 1;
          }
          break;
        case "and":
          // Check whether the number before the "and" was a unit
          String prevWord = prev.word();
          if (lastUnitPos == i - 1 || (lastUnitPos == i - 2 && ",".equals(prevWord))) {
            // Okay
          } else {
            // Two separate numbers
            if (numStart < possibleNumEnd) {
              numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
              if (possibleNumStart >= possibleNumEnd) {
                numStart = possibleNumStart;
              } else {
                numStart = i + 1;
              }
            } else if (numStart < i) {
              numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
              numStart = i + 1;
            }
            if (lastUnitPos < numStart) {
              lastUnit = null;
              lastUnitPos = -1;
            }
            possibleNumEnd = -1;
            possibleNumEndUnit = null;
          }
          break;
        default:
          // NUMBER or ORDINAL
          String numType = token.get(CoreAnnotations.NumericTypeAnnotation.class);
          if ("UNIT".equals(numType)) {
            // Compare this unit with the previous one
            if (lastUnit == null || lastUnit.longValue() > num.longValue()) {
              // lastUnit is larger than this unit
              // maybe "four thousand two hundred"?
              // OKAY, probably one big number
            } else {
              if (numStart < possibleNumEnd) {
                // Not one big number (there was a comma)
                if (num.longValue() >= possibleNumEndUnit.longValue()) {
                  numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                  if (possibleNumStart >= possibleNumEnd) {
                    numStart = possibleNumStart;
                  } else {
                    numStart = i;
                  }
                  possibleNumEnd = -1;
                  possibleNumEndUnit = null;
                }
              } else {
                // The unit is increasing - can be okay, maybe "five hundred thousand"?
                // What about "four hundred five thousand"?
                // The unit might also be the same, as in "thousand thousand",
                // which we convert to million
              }
            }
            lastUnit = num;
            lastUnitPos = i;
          } else {
            // Normal number
            if (num == null) {
              logger.warning("NO NUMBER: " + token.word());
              continue;
            }
            if (prevNum != null) {
              if (num.doubleValue() > 0) {
                if (num.doubleValue() < 10) {
                  // [one to nine] [0-9]
                  if (NumberNormalizer.numPattern.matcher(prev.word()).matches() || prevNum.longValue() < 10 || prevNum.longValue() % 10 != 0) {
                    // Two separate numbers
                    if (numStart < i) {
                      numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                    }
                    numStart = i;
                    possibleNumEnd = -1;
                    possibleNumEndUnit = null;
                    lastUnit = null;
                    lastUnitPos = -1;
                  }
                } else {
                  String prevNumType = prev.get(CoreAnnotations.NumericTypeAnnotation.class);
                  if ("UNIT".equals(prevNumType)) {
                    // OKAY
                  } else if (!ordinalUnitPattern.matcher(w).matches()) {
                    // Start of a new number
                    if (numStart < i) {
                      numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                    }
                    numStart = i;
                    possibleNumEnd = -1;
                    possibleNumEndUnit = null;
                    lastUnit = null;
                    lastUnitPos = -1;
                  }
                }
              }
            }
            if ("ORDINAL".equals(numType)) {
              if (possibleNumEnd >= 0) {
                if (numStart < possibleNumEnd) {
                  numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                }
                if (possibleNumStart > possibleNumEnd) {
                  numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumStart, i + 1));
                } else {
                  numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumEnd + 1, i + 1));
                }
              } else {
                if (numStart < i + 1) {
                  numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i + 1));
                }
              }
              numStart = i + 1;
              possibleNumEnd = -1;
              possibleNumEndUnit = null;
              lastUnit = null;
              lastUnitPos = -1;
            }
            if (possibleNumStart < possibleNumEnd) {
              possibleNumStart = i;
            }
          }
          break;
      }
    }
    if (numStart < matcher.end()) {
      numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, matcher.end()));
    }
  }
  // Second pass: annotate each composite number chunk (and its tokens) with the composite value and type
  for (CoreMap n : numbers) {
    String exp = n.get(CoreAnnotations.TextAnnotation.class);
    if (exp.trim().equals("")) {
      continue;
    }
    List<CoreLabel> ts = n.get(CoreAnnotations.TokensAnnotation.class);
    String label = ts.get(ts.size() - 1).get(CoreAnnotations.NumericTypeAnnotation.class);
    if ("UNIT".equals(label)) {
      label = "NUMBER";
    }
    try {
      Number num = NumberNormalizer.wordToNumber(exp);
      if (num == null) {
        logger.warning("NO NUMBER FOR: \"" + exp + "\"");
      }
      n.set(CoreAnnotations.NumericCompositeValueAnnotation.class, num);
      n.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, label);
      for (CoreLabel t : ts) {
        t.set(CoreAnnotations.NumericCompositeValueAnnotation.class, num);
        t.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, label);
      }
    } catch (NumberFormatException ex) {
      logger.warning("Invalid number for: \"" + exp + "\"", ex);
    }
  }
  return numbers;
}
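A minimal driver for findNumbers might look like the following sketch; the sentence is illustrative, and tokenization alone supplies the word forms the method inspects (the POS check is commented out in the method itself).

import java.util.Properties;
import edu.stanford.nlp.ie.NumberNormalizer;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class FindNumbersDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("He sold four hundred and seven tickets, then four five six more.");
    pipeline.annotate(doc);
    CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
    // Each returned chunk carries the composite value set in the second pass above
    for (CoreMap number : NumberNormalizer.findNumbers(sentence)) {
      System.out.println(number.get(CoreAnnotations.TextAnnotation.class) + " = "
          + number.get(CoreAnnotations.NumericCompositeValueAnnotation.class));
    }
  }
}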
Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.
In the class ApplyPatterns, the method call:
@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
  try {
    Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
    TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
    CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
    for (String sentid : sentids) {
      List<CoreLabel> sent = sents.get(sentid).getTokens();
      for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
        if (pEn.getKey() == null) {
          throw new RuntimeException("why is the pattern " + pEn + " null?");
        }
        TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
        // Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
        // m.setFindType(SequenceMatcher.FindType.FIND_ALL);
        // Higher branch values make the matcher faster, but use more memory
        m.setBranchLimit(5);
        while (m.find()) {
          int s = m.start("$term");
          int e = m.end("$term");
          assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
          String phrase = "";
          String phraseLemma = "";
          boolean useWordNotLabeled = false;
          boolean doNotUse = false;
          // Find out whether the neighboring words are labeled - if so, club them together
          if (constVars.clubNeighboringLabeledWords) {
            for (int i = s - 1; i >= 0; i--) {
              if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                s = i + 1;
                break;
              }
            }
            for (int i = e; i < sent.size(); i++) {
              if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                e = i;
                break;
              }
            }
          }
          // Track added indices, to make sure we discard phrases with stop words in between, but include the ones
          // in which stop words were removed at the ends (if removeStopWordsFromSelectedPhrases is true)
          boolean[] addedindices = new boolean[e - s];
          Arrays.fill(addedindices, false);
          for (int i = s; i < e; i++) {
            CoreLabel l = sent.get(i);
            l.set(PatternsAnnotations.MatchedPattern.class, true);
            if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null) {
              l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
            }
            SurfacePattern pSur = (SurfacePattern) pEn.getValue();
            assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
            assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
            l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
            for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
              if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                doNotUse = true;
              }
            }
            boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
            if (removePhrasesWithStopWords && containsStop) {
              doNotUse = true;
            } else {
              if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                  useWordNotLabeled = true;
                }
                phrase += " " + l.word();
                phraseLemma += " " + l.lemma();
                addedindices[i - s] = true;
              }
            }
          }
          // Discard the phrase if a token in the middle was dropped (a stop word surrounded by added tokens)
          for (int i = 0; i < addedindices.length; i++) {
            if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] && !addedindices[i] && addedindices[i + 1]) {
              doNotUse = true;
              break;
            }
          }
          if (!doNotUse) {
            matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
            phrase = phrase.trim();
            if (!phrase.isEmpty()) {
              phraseLemma = phraseLemma.trim();
              CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
              allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
              if (!useWordNotLabeled) {
                alreadyLabeledPhrases.add(candPhrase);
              }
            }
          }
        }
      }
    }
    return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
  } catch (Exception e) {
    e.printStackTrace();
    throw e;
  }
}
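The core TokensRegex idiom that call() relies on, a compiled pattern with a named $term group matched under a branch limit, can be reproduced standalone. The pattern string and sentence below are illustrative assumptions, not patterns the pattern-learning code actually generates.

import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class TermPatternDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation doc = new Annotation("Patients were treated with aspirin and with ibuprofen.");
    pipeline.annotate(doc);
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    // "(?$term ...)" names the capture group, mirroring the $term group used above
    TokenSequencePattern pattern = TokenSequencePattern.compile("/with/ (?$term [{tag:NN}]+)");
    TokenSequenceMatcher m = pattern.getMatcher(tokens);
    m.setBranchLimit(5); // same knob as above: higher values are faster but use more memory
    while (m.find()) {
      System.out.println(m.group("$term") + " [" + m.start("$term") + "," + m.end("$term") + ")");
    }
  }
}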