
Example 1 with TokenSequenceMatcher

Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.

From the class NumberNormalizer, method findNumberRanges.

public static List<CoreMap> findNumberRanges(CoreMap annotation) {
    List<CoreMap> numerizedTokens = annotation.get(CoreAnnotations.NumerizedTokensAnnotation.class);
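    // First pass: tag single tokens whose text already encodes a range (the regex captures the two endpoint numbers)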
    for (CoreMap token : numerizedTokens) {
        String w = token.get(CoreAnnotations.TextAnnotation.class);
        w = w.trim().toLowerCase();
        Matcher rangeMatcher = NumberNormalizer.numRangePattern.matcher(w);
        if (rangeMatcher.matches()) {
            try {
                String w1 = rangeMatcher.group(1);
                String w2 = rangeMatcher.group(2);
                Number v1 = NumberNormalizer.wordToNumber(w1);
                Number v2 = NumberNormalizer.wordToNumber(w2);
                if (v2.doubleValue() > v1.doubleValue()) {
                    token.set(CoreAnnotations.NumericTypeAnnotation.class, "NUMBER_RANGE");
                    token.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, "NUMBER_RANGE");
                    Pair<Number, Number> range = new Pair<>(v1, v2);
                    token.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range);
                }
            } catch (Exception ex) {
                logger.warning("Error interpreting number range " + w + ": " + ex.getMessage());
            }
        }
    }
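    // Second pass: run the TokensRegex rangePattern over the numerized tokens to catch multi-token ranges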
    List<CoreMap> numberRanges = new ArrayList<>();
    TokenSequenceMatcher matcher = rangePattern.getMatcher(numerizedTokens);
    while (matcher.find()) {
        List<CoreMap> matched = matcher.groupNodes();
        if (matched.size() == 1) {
            numberRanges.add(matched.get(0));
        } else {
            Number v1 = matched.get(0).get(CoreAnnotations.NumericCompositeValueAnnotation.class);
            Number v2 = matched.get(matched.size() - 1).get(CoreAnnotations.NumericCompositeValueAnnotation.class);
            if (v2.doubleValue() > v1.doubleValue()) {
                CoreMap newChunk = CoreMapAggregator.getDefaultAggregator().merge(numerizedTokens, matcher.start(), matcher.end());
                newChunk.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, "NUMBER_RANGE");
                Pair<Number, Number> range = new Pair<>(v1, v2);
                newChunk.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range);
                numberRanges.add(newChunk);
            }
        }
    }
    return numberRanges;
}
Also used: Matcher (java.util.regex.Matcher), TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher), CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)
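A hedged usage sketch: rangePattern and numRangePattern are fields internal to NumberNormalizer, so the stand-alone example below compiles a stand-in TokensRegex pattern to show the same matching flow the method uses (compile a TokenSequencePattern, get a TokenSequenceMatcher over the token list, iterate with find()). The pattern string and sentence are invented for illustration.

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class RangeMatcherSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("Expect ten to twenty visitors, maybe more.");
        pipeline.annotate(doc);
        List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
        // Stand-in for NumberNormalizer's package-private rangePattern:
        // two CD-tagged tokens joined by "to" or "through"
        TokenSequencePattern rangeLike =
                TokenSequencePattern.compile("(?$v1 [{tag:/CD/}]) /to|through/ (?$v2 [{tag:/CD/}])");
        TokenSequenceMatcher matcher = rangeLike.getMatcher(tokens);
        while (matcher.find()) {
            // group()/start()/end() mirror the endpoint extraction in findNumberRanges
            System.out.printf("range candidate \"%s\" at tokens [%d,%d)%n",
                    matcher.group(), matcher.start(), matcher.end());
        }
    }
}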

Example 2 with TokenSequenceMatcher

Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.

From the class RelationTripleSegmenter, method extract.

/**
   * Extract the nominal patterns from this sentence.
   *
   * @see RelationTripleSegmenter#NOUN_TOKEN_PATTERNS
   * @see RelationTripleSegmenter#NOUN_DEPENDENCY_PATTERNS
   *
   * @param parse The parse tree of the sentence to annotate.
   * @param tokens The tokens of the sentence to annotate.
   * @return A list of {@link RelationTriple}s. Note that these do not have an associated tree with them.
   */
@SuppressWarnings("unchecked")
public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens) {
    List<RelationTriple> extractions = new ArrayList<>();
    Set<Triple<Span, String, Span>> alreadyExtracted = new HashSet<>();
    // Run the token (surface) patterns
    for (TokenSequencePattern tokenPattern : NOUN_TOKEN_PATTERNS) {
        TokenSequenceMatcher tokenMatcher = tokenPattern.matcher(tokens);
        while (tokenMatcher.find()) {
            boolean missingPrefixBe;
            boolean missingSuffixOf = false;
            // Create subject
            List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
            Span subjectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) subject.get(0)).index() - 1, ((CoreLabel) subject.get(subject.size() - 1)).index()));
            List<CoreLabel> subjectTokens = new ArrayList<>();
            for (int i : subjectSpan) {
                subjectTokens.add(tokens.get(i));
            }
            // Create object
            List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
            Span objectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) object.get(0)).index() - 1, ((CoreLabel) object.get(object.size() - 1)).index()));
            if (Span.overlaps(subjectSpan, objectSpan)) {
                continue;
            }
            List<CoreLabel> objectTokens = new ArrayList<>();
            for (int i : objectSpan) {
                objectTokens.add(tokens.get(i));
            }
            // Create relation
            if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
                List<CoreLabel> relationTokens = new ArrayList<>();
                // (add the 'be')
                missingPrefixBe = true;
                // (add a complement to the 'be')
                List<? extends CoreMap> beofComp = tokenMatcher.groupNodes("$beof_comp");
                if (beofComp != null) {
                    // (add the complement)
                    for (CoreMap token : beofComp) {
                        if (token instanceof CoreLabel) {
                            relationTokens.add((CoreLabel) token);
                        } else {
                            relationTokens.add(new CoreLabel(token));
                        }
                    }
                    // (add the 'of')
                    missingSuffixOf = true;
                }
                // Add extraction
                String relationGloss = StringUtils.join(relationTokens.stream().map(CoreLabel::word), " ");
                if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
                    RelationTriple extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens);
                    //noinspection ConstantConditions
                    extraction.isPrefixBe(missingPrefixBe);
                    extraction.isSuffixOf(missingSuffixOf);
                    extractions.add(extraction);
                    alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
                }
            }
        }
        // Run the dependency (Semgrex) patterns
        for (SemgrexPattern semgrex : NOUN_DEPENDENCY_PATTERNS) {
            SemgrexMatcher matcher = semgrex.matcher(parse);
            while (matcher.find()) {
                boolean missingPrefixBe = false;
                boolean missingSuffixBe = false;
                boolean istmod = false;
                // Get relaux if applicable
                String relaux = matcher.getRelnString("relaux");
                String ignoredArc = relaux;
                if (ignoredArc == null) {
                    ignoredArc = matcher.getRelnString("arc");
                }
                // Create subject
                IndexedWord subject = matcher.getNode("subject");
                List<IndexedWord> subjectTokens = new ArrayList<>();
                Span subjectSpan;
                if (subject.ner() != null && !"O".equals(subject.ner())) {
                    subjectSpan = Util.extractNER(tokens, Span.fromValues(subject.index() - 1, subject.index()));
                    for (int i : subjectSpan) {
                        subjectTokens.add(new IndexedWord(tokens.get(i)));
                    }
                } else {
                    subjectTokens = getValidChunk(parse, subject, VALID_SUBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(subject));
                    subjectSpan = Util.tokensToSpan(subjectTokens);
                }
                // Create object
                IndexedWord object = matcher.getNode("object");
                List<IndexedWord> objectTokens = new ArrayList<>();
                Span objectSpan;
                if (object.ner() != null && !"O".equals(object.ner())) {
                    objectSpan = Util.extractNER(tokens, Span.fromValues(object.index() - 1, object.index()));
                    for (int i : objectSpan) {
                        objectTokens.add(new IndexedWord(tokens.get(i)));
                    }
                } else {
                    objectTokens = getValidChunk(parse, object, VALID_OBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(object));
                    objectSpan = Util.tokensToSpan(objectTokens);
                }
                // Check that the pair is valid
                if (Span.overlaps(subjectSpan, objectSpan)) {
                    // We extracted an identity
                    continue;
                }
                if (subjectSpan.end() == objectSpan.start() - 1 && (tokens.get(subjectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(subjectSpan.end()).tag()))) {
                    // We're straddling a clause
                    continue;
                }
                if (objectSpan.end() == subjectSpan.start() - 1 && (tokens.get(objectSpan.end()).word().matches("[\\.,:;\\('\"]") || "CC".equals(tokens.get(objectSpan.end()).tag()))) {
                    // We're straddling a clause
                    continue;
                }
                // Get any prepositional edges
                String expected = relaux == null ? "" : relaux.substring(relaux.indexOf(":") + 1).replace("_", " ");
                IndexedWord prepWord = null;
                // (these usually come from the object)
                boolean prepositionIsPrefix = false;
                for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(object)) {
                    if (edge.getRelation().toString().equals("case")) {
                        prepWord = edge.getDependent();
                    }
                }
                // (...but sometimes from the subject)
                if (prepWord == null) {
                    for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(subject)) {
                        if (edge.getRelation().toString().equals("case")) {
                            prepositionIsPrefix = true;
                            prepWord = edge.getDependent();
                        }
                    }
                }
                List<IndexedWord> prepChunk = Collections.EMPTY_LIST;
                if (prepWord != null && !expected.equals("tmod")) {
                    Optional<List<IndexedWord>> optionalPrepChunk = getValidChunk(parse, prepWord, Collections.singleton("mwe"), Optional.empty(), true);
                    if (!optionalPrepChunk.isPresent()) {
                        continue;
                    }
                    prepChunk = optionalPrepChunk.get();
                    Collections.sort(prepChunk, (a, b) -> {
                        double val = a.pseudoPosition() - b.pseudoPosition();
                        if (val < 0) {
                            return -1;
                        }
                        if (val > 0) {
                            return 1;
                        } else {
                            return 0;
                        }
                    });
                // ascending sort
                }
                // Get the relation
                if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
                    LinkedList<IndexedWord> relationTokens = new LinkedList<>();
                    IndexedWord relNode = matcher.getNode("relation");
                    if (relNode != null) {
                        // Case: we have a grounded relation span
                        // (add the relation)
                        relationTokens.add(relNode);
                        // (add any prepositional case markings)
                        if (prepositionIsPrefix) {
                            // We're almost certainly missing a suffix 'be'
                            missingSuffixBe = true;
                            for (int i = prepChunk.size() - 1; i >= 0; --i) {
                                relationTokens.addFirst(prepChunk.get(i));
                            }
                        } else {
                            relationTokens.addAll(prepChunk);
                        }
                        if (expected.equalsIgnoreCase("tmod")) {
                            istmod = true;
                        }
                    } else {
                        // (mark it as missing a preceding 'be')
                        if (!expected.equals("poss")) {
                            missingPrefixBe = true;
                        }
                        // (add any prepositional case markings)
                        if (prepositionIsPrefix) {
                            for (int i = prepChunk.size() - 1; i >= 0; --i) {
                                relationTokens.addFirst(prepChunk.get(i));
                            }
                        } else {
                            relationTokens.addAll(prepChunk);
                        }
                        if (expected.equalsIgnoreCase("tmod")) {
                            istmod = true;
                        }
                        // (some fine-tuning)
                        if (allowNominalsWithoutNER && "of".equals(expected)) {
                            // prohibit things like "conductor of electricity" -> "conductor; be of; electricity"
                            continue;
                        }
                    }
                    // Add extraction
                    String relationGloss = StringUtils.join(relationTokens.stream().map(IndexedWord::word), " ");
                    if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
                        RelationTriple extraction = new RelationTriple(subjectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), relationTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()), objectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()));
                        extraction.istmod(istmod);
                        extraction.isPrefixBe(missingPrefixBe);
                        extraction.isSuffixBe(missingSuffixBe);
                        extractions.add(extraction);
                        alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
                    }
                }
            }
        }
    }
    //
    // Filter downward polarity extractions
    //
    Iterator<RelationTriple> iter = extractions.iterator();
    while (iter.hasNext()) {
        RelationTriple term = iter.next();
        boolean shouldRemove = true;
        for (CoreLabel token : term) {
            if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null || !token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards()) {
                shouldRemove = false;
            }
        }
        if (shouldRemove) {
            // Don't extract things in downward polarity contexts.
            iter.remove();
        }
    }
    // Return
    return extractions;
}
Also used: SemgrexMatcher (edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher), Span (edu.stanford.nlp.ie.machinereading.structure.Span), TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern), RelationTriple (edu.stanford.nlp.ie.util.RelationTriple), SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern), TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher), SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge), CoreLabel (edu.stanford.nlp.ling.CoreLabel), IndexedWord (edu.stanford.nlp.ling.IndexedWord)
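The $subject/$object handling above rests on TokensRegex named capture groups and groupNodes. A minimal sketch of that mechanism, with an invented possessive pattern standing in for the private NOUN_TOKEN_PATTERNS:

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class NamedGroupSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("Chomsky's theory reshaped linguistics.");
        pipeline.annotate(doc);
        List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
        // "<subject> 's <object>": named groups are read back with groupNodes("$name")
        TokenSequencePattern pattern = TokenSequencePattern.compile(
                "(?$subject [{tag:/NNP/}]+) /'s/ (?$object [{tag:/NNS?/}]+)");
        TokenSequenceMatcher tokenMatcher = pattern.matcher(tokens);
        while (tokenMatcher.find()) {
            List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
            List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
            System.out.println(subject + " --('s)--> " + object);
        }
    }
}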

Example 3 with TokenSequenceMatcher

Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.

From the class NaturalLogicAnnotator, method annotateUnaries.

/**
   * Annotate any unary quantifiers that weren't found in the main {@link NaturalLogicAnnotator#annotateOperators(CoreMap)} method.
   * @param sentence The sentence to annotate.
   */
private static void annotateUnaries(CoreMap sentence) {
    // Get tree and tokens
    SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    if (tree == null) {
        tree = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    }
    List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
    // Get operator exists mask
    boolean[] isOperator = new boolean[tokens.size()];
    for (int i = 0; i < isOperator.length; ++i) {
        OperatorSpec spec = tokens.get(i).get(OperatorAnnotation.class);
        if (spec != null) {
            for (int k = spec.quantifierBegin; k < spec.quantifierEnd; ++k) {
                isOperator[k] = true;
            }
        }
    }
    // Match Semgrex
    SemgrexMatcher matcher = UNARY_PATTERN.matcher(tree);
    while (matcher.find()) {
        // Get relevant nodes
        IndexedWord quantifier = matcher.getNode("quantifier");
        String word = quantifier.word().toLowerCase();
        if (word.equals("a") || word.equals("an") || word.equals("the") || "CD".equals(quantifier.tag())) {
            // These are absurdly common, and uninformative, and we're just going to shoot ourselves in the foot from parsing errors and idiomatic expressions.
            continue;
        }
        IndexedWord subject = matcher.getNode("subject");
        // ... If there is not already an operator there
        if (!isOperator[quantifier.index() - 1]) {
            Optional<Triple<Operator, Integer, Integer>> quantifierInfo = validateQuantifierByHead(sentence, quantifier);
            // ... and if we found a quantifier span
            if (quantifierInfo.isPresent()) {
                // Then add the unary operator!
                OperatorSpec scope = computeScope(tree, quantifierInfo.get().first, subject, Pair.makePair(quantifierInfo.get().second, quantifierInfo.get().third), null, false, null, tokens.size());
                CoreLabel token = tokens.get(quantifier.index() - 1);
                token.set(OperatorAnnotation.class, scope);
            }
        }
    }
    // Match TokensRegex
    TokenSequenceMatcher tokenMatcher = DOUBT_PATTERN.matcher(tokens);
    while (tokenMatcher.find()) {
        List<CoreLabel> doubt = (List<CoreLabel>) tokenMatcher.groupNodes("$doubt");
        List<CoreLabel> target = (List<CoreLabel>) tokenMatcher.groupNodes("$target");
        for (CoreLabel word : doubt) {
            OperatorSpec spec = new OperatorSpec(Operator.GENERAL_NEG_POLARITY, word.index() - 1, word.index(), target.get(0).index() - 1, target.get(target.size() - 1).index(), 0, 0, tokens.size());
            word.set(OperatorAnnotation.class, spec);
        }
    }
}
Also used: SemgrexMatcher (edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher), TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher), SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations), CoreLabel (edu.stanford.nlp.ling.CoreLabel), CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations), SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph), IndexedWord (edu.stanford.nlp.ling.IndexedWord)
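UNARY_PATTERN and DOUBT_PATTERN are private to NaturalLogicAnnotator; the sketch below shows only the generic Semgrex flow the method relies on, using a simplified determiner pattern that is not the annotator's real one.

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.util.CoreMap;

public class UnarySketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,depparse");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("Some cats like milk.");
        pipeline.annotate(doc);
        CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
        SemanticGraph tree = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
        // Simplified stand-in for UNARY_PATTERN: a noun with a determiner attached
        SemgrexPattern pattern = SemgrexPattern.compile("{}=subject >det {}=quantifier");
        SemgrexMatcher matcher = pattern.matcher(tree);
        while (matcher.find()) {
            IndexedWord quantifier = matcher.getNode("quantifier");
            IndexedWord subject = matcher.getNode("subject");
            System.out.println(quantifier.word() + " scopes over " + subject.word());
        }
    }
}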

Example 4 with TokenSequenceMatcher

Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.

From the class NumberNormalizer, method findNumbers.

/**
   * Find and mark numbers (does not need NumberSequenceClassifier)
   * Each token is annotated with the numeric value and type
   * - CoreAnnotations.NumericTypeAnnotation.class: ORDINAL, UNIT (hundred, thousand,..., dozen, gross,...), NUMBER
   * - CoreAnnotations.NumericValueAnnotation.class: Number representing the numeric value of the token
   *   ( two thousand => 2 1000 )
   *
   * Tries also to separate individual numbers like four five six,
   *   while keeping numbers like four hundred and seven together
   * Annotate tokens belonging to each composite number with
   * - CoreAnnotations.NumericCompositeTypeAnnotation.class: ORDINAL (1st, 2nd), NUMBER (one hundred)
   * - CoreAnnotations.NumericCompositeValueAnnotation.class: Number representing the composite numeric value
   *   ( two thousand => 2000 2000 )
   *
   * Also returns list of CoreMap representing the identified numbers
   *
   * The function is overly aggressive in marking possible numbers
   *  - should either do more checks or use in conjunction with NumberSequenceClassifier
   *    to avoid marking certain tokens (like second/NN) as numbers...
   *
   * @param annotation The annotation structure
   * @return list of CoreMap representing the identified numbers
   */
public static List<CoreMap> findNumbers(CoreMap annotation) {
    List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    for (CoreLabel token : tokens) {
        String w = token.word();
        w = w.trim().toLowerCase();
        if (/*("CD".equals(token.get(CoreAnnotations.PartOfSpeechAnnotation.class))  || */
        NumberNormalizer.numPattern.matcher(w).matches() || NumberNormalizer.numberTermPattern2.matcher(w).matches() || NumberSequenceClassifier.ORDINAL_PATTERN.matcher(w).matches() || NumberNormalizer.numEndUnitPattern.matcher(w).matches()) {
            // But maybe we don't care, this can just mark the potential numbers, something else can disregard those
            try {
                token.set(CoreAnnotations.NumericValueAnnotation.class, NumberNormalizer.wordToNumber(w));
                if (NumberSequenceClassifier.ORDINAL_PATTERN.matcher(w).find()) {
                    token.set(CoreAnnotations.NumericTypeAnnotation.class, "ORDINAL");
                } else if (NumberNormalizer.numUnitPattern.matcher(w).matches()) {
                    token.set(CoreAnnotations.NumericTypeAnnotation.class, "UNIT");
                } else if (NumberNormalizer.numEndUnitPattern.matcher(w).matches()) {
                    token.set(CoreAnnotations.NumericTypeAnnotation.class, "UNIT");
                } else {
                    token.set(CoreAnnotations.NumericTypeAnnotation.class, "NUMBER");
                }
            } catch (Exception ex) {
                logger.warning("Error interpreting number " + w + ": " + ex.getMessage());
            }
        }
    }
    // TODO: Should we allow "," in written out numbers?
    // TODO: Handle "-" that is not with token?
    TokenSequenceMatcher matcher = numberPattern.getMatcher(tokens);
    List<CoreMap> numbers = new ArrayList<>();
    while (matcher.find()) {
        @SuppressWarnings("unused") List<CoreMap> matchedTokens = matcher.groupNodes();
        int numStart = matcher.start();
        int possibleNumEnd = -1;
        int lastUnitPos = -1;
        int possibleNumStart = -1;
        Number possibleNumEndUnit = null;
        Number lastUnit = null;
        // Check if we need to split matched chunk up more
        for (int i = matcher.start(); i < matcher.end(); i++) {
            CoreLabel token = tokens.get(i);
            CoreLabel prev = (i > matcher.start()) ? tokens.get(i - 1) : null;
            Number num = token.get(CoreAnnotations.NumericValueAnnotation.class);
            Number prevNum = (prev != null) ? prev.get(CoreAnnotations.NumericValueAnnotation.class) : null;
            String w = token.word();
            w = w.trim().toLowerCase();
            switch(w) {
                case ",":
                    if (lastUnit != null && lastUnitPos == i - 1) {
                        // OKAY, this may be one big number
                        possibleNumEnd = i;
                        possibleNumEndUnit = lastUnit;
                    } else {
                        // Not one big number
                        if (numStart < i) {
                            numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                            numStart = i + 1;
                            possibleNumEnd = -1;
                            possibleNumEndUnit = null;
                            lastUnit = null;
                            lastUnitPos = -1;
                        }
                    }
                    if (numStart == i) {
                        numStart = i + 1;
                    }
                    break;
                case "and":
                    // Check if number before and was unit
                    String prevWord = prev.word();
                    if (lastUnitPos == i - 1 || (lastUnitPos == i - 2 && ",".equals(prevWord))) {
                    // Okay
                    } else {
                        // Two separate numbers
                        if (numStart < possibleNumEnd) {
                            numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                            if (possibleNumStart >= possibleNumEnd) {
                                numStart = possibleNumStart;
                            } else {
                                numStart = i + 1;
                            }
                        } else if (numStart < i) {
                            numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                            numStart = i + 1;
                        }
                        if (lastUnitPos < numStart) {
                            lastUnit = null;
                            lastUnitPos = -1;
                        }
                        possibleNumEnd = -1;
                        possibleNumEndUnit = null;
                    }
                    break;
                default:
                    // NUMBER or ORDINAL
                    String numType = token.get(CoreAnnotations.NumericTypeAnnotation.class);
                    if ("UNIT".equals(numType)) {
                        // Compare this unit with previous
                        if (lastUnit == null || lastUnit.longValue() > num.longValue()) {
                        // lastUnit larger than this unit
                        // maybe four thousand two hundred?
                        // OKAY, probably one big number
                        } else {
                            if (numStart < possibleNumEnd) {
                                // Not one big number  ( had a comma )
                                if (num.longValue() >= possibleNumEndUnit.longValue()) {
                                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                                    if (possibleNumStart >= possibleNumEnd) {
                                        numStart = possibleNumStart;
                                    } else {
                                        numStart = i;
                                    }
                                    possibleNumEnd = -1;
                                    possibleNumEndUnit = null;
                                }
                            } else {
                            // unit is increasing - can be okay, maybe five hundred thousand?
                            // what about four hundred five thousand
                            // unit might also be the same, as in thousand thousand,
                            // which we convert to million
                            }
                        }
                        lastUnit = num;
                        lastUnitPos = i;
                    } else {
                        // Normal number
                        if (num == null) {
                            logger.warning("NO NUMBER: " + token.word());
                            continue;
                        }
                        if (prevNum != null) {
                            if (num.doubleValue() > 0) {
                                if (num.doubleValue() < 10) {
                                    //    [one to nine]  [0-9]
                                    if (NumberNormalizer.numPattern.matcher(prev.word()).matches() || prevNum.longValue() < 10 || prevNum.longValue() % 10 != 0) {
                                        // two separate numbers
                                        if (numStart < i) {
                                            numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                                        }
                                        numStart = i;
                                        possibleNumEnd = -1;
                                        possibleNumEndUnit = null;
                                        lastUnit = null;
                                        lastUnitPos = -1;
                                    }
                                } else {
                                    String prevNumType = prev.get(CoreAnnotations.NumericTypeAnnotation.class);
                                    if ("UNIT".equals(prevNumType)) {
                                    // OKAY
                                    } else if (!ordinalUnitPattern.matcher(w).matches()) {
                                        // Start of new number
                                        if (numStart < i) {
                                            numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i));
                                        }
                                        numStart = i;
                                        possibleNumEnd = -1;
                                        possibleNumEndUnit = null;
                                        lastUnit = null;
                                        lastUnitPos = -1;
                                    }
                                }
                            }
                        }
                        if ("ORDINAL".equals(numType)) {
                            if (possibleNumEnd >= 0) {
                                if (numStart < possibleNumEnd) {
                                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, possibleNumEnd));
                                }
                                if (possibleNumStart > possibleNumEnd) {
                                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumStart, i + 1));
                                } else {
                                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, possibleNumEnd + 1, i + 1));
                                }
                            } else {
                                if (numStart < i + 1) {
                                    numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, i + 1));
                                }
                            }
                            numStart = i + 1;
                            possibleNumEnd = -1;
                            possibleNumEndUnit = null;
                            lastUnit = null;
                            lastUnitPos = -1;
                        }
                        if (possibleNumStart < possibleNumEnd) {
                            possibleNumStart = i;
                        }
                    }
                    break;
            }
        }
        if (numStart < matcher.end()) {
            numbers.add(ChunkAnnotationUtils.getAnnotatedChunk(annotation, numStart, matcher.end()));
        }
    }
    for (CoreMap n : numbers) {
        String exp = n.get(CoreAnnotations.TextAnnotation.class);
        if (exp.trim().equals("")) {
            continue;
        }
        List<CoreLabel> ts = n.get(CoreAnnotations.TokensAnnotation.class);
        String label = ts.get(ts.size() - 1).get(CoreAnnotations.NumericTypeAnnotation.class);
        if ("UNIT".equals(label)) {
            label = "NUMBER";
        }
        try {
            Number num = NumberNormalizer.wordToNumber(exp);
            if (num == null) {
                logger.warning("NO NUMBER FOR: \"" + exp + "\"");
            }
            n.set(CoreAnnotations.NumericCompositeValueAnnotation.class, num);
            n.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, label);
            for (CoreLabel t : ts) {
                t.set(CoreAnnotations.NumericCompositeValueAnnotation.class, num);
                t.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, label);
            }
        } catch (NumberFormatException ex) {
            logger.warning("Invalid number for: \"" + exp + "\"", ex);
        }
    }
    return numbers;
}
Also used: TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher), CoreLabel (edu.stanford.nlp.ling.CoreLabel), CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)
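findNumbers is public and static, so it can be driven directly from a tokenized, POS-tagged sentence. A minimal sketch, assuming NumberNormalizer's usual home in edu.stanford.nlp.ie (the sentence is invented):

import java.util.Properties;

import edu.stanford.nlp.ie.NumberNormalizer;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class FindNumbersSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("They sold four hundred and seven tickets in two cities.");
        pipeline.annotate(doc);
        for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
            for (CoreMap number : NumberNormalizer.findNumbers(sentence)) {
                // The composite value/type were set on the chunk in the final loop of findNumbers
                System.out.println(number.get(CoreAnnotations.TextAnnotation.class)
                        + " -> " + number.get(CoreAnnotations.NumericCompositeValueAnnotation.class)
                        + " (" + number.get(CoreAnnotations.NumericCompositeTypeAnnotation.class) + ")");
            }
        }
    }
}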

Example 5 with TokenSequenceMatcher

Use of edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher in project CoreNLP by stanfordnlp.

From the class ApplyPatterns, method call.

@Override
public Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> call() throws Exception {
    // CollectionValuedMap<String, Integer>();
    try {
        Set<CandidatePhrase> alreadyLabeledPhrases = new HashSet<>();
        TwoDimensionalCounter<CandidatePhrase, E> allFreq = new TwoDimensionalCounter<>();
        CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat = new CollectionValuedMap<>();
        for (String sentid : sentids) {
            List<CoreLabel> sent = sents.get(sentid).getTokens();
            for (Entry<TokenSequencePattern, E> pEn : patterns.entrySet()) {
                if (pEn.getKey() == null)
                    throw new RuntimeException("why is the pattern " + pEn + " null?");
                TokenSequenceMatcher m = pEn.getKey().getMatcher(sent);
                //        //Setting this find type can save time in searching - greedy and reluctant quantifiers are not enforced
                //        m.setFindType(SequenceMatcher.FindType.FIND_ALL);
                //Higher branch values makes the faster but uses more memory
                m.setBranchLimit(5);
                while (m.find()) {
                    int s = m.start("$term");
                    int e = m.end("$term");
                    assert e - s <= PatternFactory.numWordsCompoundMapped.get(label) : "How come the pattern " + pEn.getKey() + " is extracting phrases longer than numWordsCompound of " + PatternFactory.numWordsCompoundMapped.get(label) + " for label " + label;
                    String phrase = "";
                    String phraseLemma = "";
                    boolean useWordNotLabeled = false;
                    boolean doNotUse = false;
                    //find if the neighboring words are labeled - if so - club them together
                    if (constVars.clubNeighboringLabeledWords) {
                        for (int i = s - 1; i >= 0; i--) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                s = i + 1;
                                break;
                            }
                        }
                        for (int i = e; i < sent.size(); i++) {
                            if (!sent.get(i).get(constVars.getAnswerClass().get(label)).equals(label)) {
                                e = i;
                                break;
                            }
                        }
                    }
                    //to make sure we discard phrases with stopwords in between, but include the ones in which stop words were removed at the ends if removeStopWordsFromSelectedPhrases is true
                    boolean[] addedindices = new boolean[e - s];
                    Arrays.fill(addedindices, false);
                    for (int i = s; i < e; i++) {
                        CoreLabel l = sent.get(i);
                        l.set(PatternsAnnotations.MatchedPattern.class, true);
                        if (!l.containsKey(PatternsAnnotations.MatchedPatterns.class) || l.get(PatternsAnnotations.MatchedPatterns.class) == null)
                            l.set(PatternsAnnotations.MatchedPatterns.class, new HashSet<>());
                        SurfacePattern pSur = (SurfacePattern) pEn.getValue();
                        assert pSur != null : "Why is " + pEn.getValue() + " not present in the index?!";
                        assert l.get(PatternsAnnotations.MatchedPatterns.class) != null : "How come MatchedPatterns class is null for the token. The classes in the key set are " + l.keySet();
                        l.get(PatternsAnnotations.MatchedPatterns.class).add(pSur);
                        for (Entry<Class, Object> ig : constVars.getIgnoreWordswithClassesDuringSelection().get(label).entrySet()) {
                            if (l.containsKey(ig.getKey()) && l.get(ig.getKey()).equals(ig.getValue())) {
                                doNotUse = true;
                            }
                        }
                        boolean containsStop = containsStopWord(l, constVars.getCommonEngWords(), PatternFactory.ignoreWordRegex);
                        if (removePhrasesWithStopWords && containsStop) {
                            doNotUse = true;
                        } else {
                            if (!containsStop || !removeStopWordsFromSelectedPhrases) {
                                if (label == null || l.get(constVars.getAnswerClass().get(label)) == null || !l.get(constVars.getAnswerClass().get(label)).equals(label.toString())) {
                                    useWordNotLabeled = true;
                                }
                                phrase += " " + l.word();
                                phraseLemma += " " + l.lemma();
                                addedindices[i - s] = true;
                            }
                        }
                    }
                    for (int i = 0; i < addedindices.length; i++) {
                        if (i > 0 && i < addedindices.length - 1 && addedindices[i - 1] == true && addedindices[i] == false && addedindices[i + 1] == true) {
                            doNotUse = true;
                            break;
                        }
                    }
                    if (!doNotUse) {
                        matchedTokensByPat.add(pEn.getValue(), new Triple<>(sentid, s, e - 1));
                        phrase = phrase.trim();
                        if (!phrase.isEmpty()) {
                            phraseLemma = phraseLemma.trim();
                            CandidatePhrase candPhrase = CandidatePhrase.createOrGet(phrase, phraseLemma);
                            allFreq.incrementCount(candPhrase, pEn.getValue(), 1.0);
                            if (!useWordNotLabeled)
                                alreadyLabeledPhrases.add(candPhrase);
                        }
                    }
                }
            }
        }
        return new Triple<>(allFreq, matchedTokensByPat, alreadyLabeledPhrases);
    } catch (Exception e) {
        e.printStackTrace();
        throw e;
    }
}
Also used: CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap), TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern), TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher), TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter), Triple (edu.stanford.nlp.util.Triple), CoreLabel (edu.stanford.nlp.ling.CoreLabel)
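Stripped of the SPIED bookkeeping, the core of the loop above is: compile a surface pattern with a $term group, cap the matcher's branching with setBranchLimit, and read the group's token offsets back with start("$term")/end("$term"). A stand-alone sketch of just that core (pattern and sentence invented):

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class TermExtractionSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation doc = new Annotation("The patients were given aspirin and beta blockers.");
        pipeline.annotate(doc);
        List<CoreLabel> sent = doc.get(CoreAnnotations.TokensAnnotation.class);
        // "noun phrase following 'given'", captured as $term, loosely echoing a SPIED surface pattern
        TokenSequencePattern p = TokenSequencePattern.compile("/given/ (?$term [{tag:/NNS?/}]+)");
        TokenSequenceMatcher m = p.getMatcher(sent);
        m.setBranchLimit(5); // bound the matcher's branching, as ApplyPatterns does
        while (m.find()) {
            int s = m.start("$term");
            int e = m.end("$term");
            StringBuilder phrase = new StringBuilder();
            for (int i = s; i < e; i++) {
                phrase.append(' ').append(sent.get(i).word());
            }
            System.out.println("candidate phrase:" + phrase);
        }
    }
}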

Aggregations

TokenSequenceMatcher (edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher): 6
CoreLabel (edu.stanford.nlp.ling.CoreLabel): 4
CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations): 3
IndexedWord (edu.stanford.nlp.ling.IndexedWord): 2
TokenSequencePattern (edu.stanford.nlp.ling.tokensregex.TokenSequencePattern): 2
SemgrexMatcher (edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher): 2
Span (edu.stanford.nlp.ie.machinereading.structure.Span): 1
RelationTriple (edu.stanford.nlp.ie.util.RelationTriple): 1
SemanticGraph (edu.stanford.nlp.semgraph.SemanticGraph): 1
SemanticGraphCoreAnnotations (edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations): 1
SemanticGraphEdge (edu.stanford.nlp.semgraph.SemanticGraphEdge): 1
SemgrexPattern (edu.stanford.nlp.semgraph.semgrex.SemgrexPattern): 1
TwoDimensionalCounter (edu.stanford.nlp.stats.TwoDimensionalCounter): 1
CollectionValuedMap (edu.stanford.nlp.util.CollectionValuedMap): 1
Triple (edu.stanford.nlp.util.Triple): 1
Matcher (java.util.regex.Matcher): 1