Search in sources :

Example 1 with PaddedList

use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.

the class NERFeatureFactoryITest method testSloppyGazette.

/**
 * Exercises {@code NERFeatureFactory.featuresC} with both {@code useGazettes}
 * and {@code sloppyGazette} switched on, using the test gazette file, and
 * checks the features reported for the tokens at positions 4, 5 and 6 of the
 * sample sentence ("John", "Bauer" and "has").
 */
public void testSloppyGazette() {
    String[] words = "For three years , John Bauer has worked at Stanford .".split(" +");
    List<CoreLabel> tokens = SentenceUtils.toCoreLabelList(words);
    // Wrap the sentence so that the factory can look beyond either end of it.
    PaddedList<CoreLabel> padded = new PaddedList<>(tokens, new CoreLabel());

    Properties gazetteProps = new Properties();
    gazetteProps.setProperty("useGazettes", "true");
    gazetteProps.setProperty("sloppyGazette", "true");
    gazetteProps.setProperty("gazette", "projects/core/data/edu/stanford/nlp/ie/test_gazette.txt");
    SeqClassifierFlags classifierFlags = new SeqClassifierFlags(gazetteProps);

    NERFeatureFactory<CoreLabel> featureFactory = new NERFeatureFactory<>();
    featureFactory.init(classifierFlags);

    // Position 4: "John"
    Set<String> johnFeatures = new HashSet<>(featureFactory.featuresC(padded, 4));
    checkFeatures(johnFeatures, "BAR-GAZ", "BAZ-GAZ", "FOO-GAZ", "BAR-GAZ2", "BAZ-GAZ2", "FOO-GAZ1", "John-WORD");

    // Position 5: "Bauer"
    Set<String> bauerFeatures = new HashSet<>(featureFactory.featuresC(padded, 5));
    checkFeatures(bauerFeatures, "BAR-GAZ", "BAZ-GAZ", "BAR-GAZ2", "BAZ-GAZ2", "Bauer-WORD");

    // Position 6: "has"
    Set<String> hasFeatures = new HashSet<>(featureFactory.featuresC(padded, 6));
    checkFeatures(hasFeatures, "has-WORD");
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) PaddedList(edu.stanford.nlp.util.PaddedList) Properties(java.util.Properties) SeqClassifierFlags(edu.stanford.nlp.sequences.SeqClassifierFlags)

Example 2 with PaddedList

use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.

the class CRFBiasedClassifier method makeDatum.

/**
 * Builds the CRF datum for position {@code loc} of {@code info}.
 *
 * <p>For each window index from 0 up to {@code windowSize} the cliques returned
 * by {@code FeatureFactory.getCliques(i, 0)} that were not already handled for
 * a smaller window are collected, and every feature factory contributes its
 * clique features for them. At test time the {@code BIAS} feature is appended
 * once per clique of window index 0. The label array holds the class indices of
 * the answers for the {@code windowSize} positions ending at {@code loc}.
 *
 * @param info the token sequence; it is wrapped in a {@link PaddedList} using {@code pad}
 * @param loc the position in {@code info} the datum is built for
 * @param featureFactories the factories queried for clique features
 * @return a datum holding one feature list per window index and the label window
 */
@Override
public CRFDatum<List<String>, CRFLabel> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
    // The pad element carries the background symbol as its answer.
    pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
    PaddedList<IN> paddedInfo = new PaddedList<>(info, pad);

    List<List<String>> featuresByWindow = new ArrayList<>();
    Collection<Clique> seenCliques = Generics.newHashSet();
    for (int window = 0; window < windowSize; window++) {
        List<String> windowFeatures = new ArrayList<>();
        // Only keep cliques that no smaller window has produced already.
        List<Clique> freshCliques = FeatureFactory.getCliques(window, 0);
        freshCliques.removeAll(seenCliques);
        seenCliques.addAll(freshCliques);
        for (Clique clique : freshCliques) {
            for (FeatureFactory<IN> factory : featureFactories) {
                windowFeatures.addAll(factory.getCliqueFeatures(paddedInfo, loc, clique));
            }
            // this feature is only present at test time and only appears
            // in cliques of size 1 (i.e., cliques with window=0)
            if (testTime && window == 0) {
                windowFeatures.add(BIAS);
            }
        }
        featuresByWindow.add(windowFeatures);
    }

    // Labels for the windowSize positions that end at loc.
    int[] labelIndices = new int[windowSize];
    int firstPosition = loc - windowSize + 1;
    for (int offset = 0; offset < windowSize; offset++) {
        IN token = paddedInfo.get(firstPosition + offset);
        String answer = token.get(CoreAnnotations.AnswerAnnotation.class);
        labelIndices[offset] = classIndex.indexOf(answer);
    }
    return new CRFDatum<>(featuresByWindow, new CRFLabel(labelIndices), null);
}
Also used : PaddedList(edu.stanford.nlp.util.PaddedList) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) Clique(edu.stanford.nlp.sequences.Clique)

Example 3 with PaddedList

use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.

the class NumberSequenceClassifier method classifyOld.

/**
 * Rule-based tagger that writes an {@code AnswerAnnotation} on every token of
 * {@code document}.
 *
 * <p>Each token is first reset to {@code flags.backgroundSymbol}. A chain of
 * heuristics over the token's word and part-of-speech tag, and those of its
 * neighbours (one to the left, up to two to the right), may then overwrite
 * that with {@code "MONEY"}, {@code "TIME"}, {@code "DATE"}, {@code "NUMBER"}
 * or {@code "ORDINAL"}. The first matching rule wins, so the order of the
 * branches matters. Tokens are processed left to right, and several rules read
 * the answer already assigned to the previous token.
 *
 * <p>Neighbours are fetched through a {@link PaddedList} built with {@code pad},
 * so lookups at {@code i - 1}, {@code i - 2}, {@code i + 1} and {@code i + 2}
 * are made even at the ends of the document. All annotation reads on neighbours
 * use {@code getString(...)} rather than {@code get(...)} so that a neighbour
 * lacking the annotation cannot cause a {@code NullPointerException}
 * (NOTE(review): relies on {@code CoreLabel.getString} returning an empty
 * string for a missing key, as the rest of this method already assumes).
 *
 * @param document the tokens to tag; their answer annotations are overwritten
 * @return the same {@code document} list that was passed in
 */
private List<CoreLabel> classifyOld(List<CoreLabel> document) {
    // if (DEBUG) { log.info("NumberSequenceClassifier tagging"); }
    PaddedList<CoreLabel> pl = new PaddedList<>(document, pad);
    for (int i = 0, sz = pl.size(); i < sz; i++) {
        CoreLabel me = pl.get(i);
        CoreLabel prev = pl.get(i - 1);
        CoreLabel next = pl.get(i + 1);
        CoreLabel next2 = pl.get(i + 2);
        //if (DEBUG) { log.info("Tagging:" + me.word()); }
        me.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
        // The POS tag of the current token is never modified in this loop, so read it once.
        String pos = me.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
        if (CURRENCY_SYMBOL_PATTERN.matcher(me.word()).matches() && (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") || next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD"))) {
            // A currency symbol directly next to a cardinal number
            if (DEBUG) {
                log.info("Found currency sign:" + me.word());
            }
            me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
        } else if (pos.equals("CD")) {
            if (DEBUG) {
                log.info("Tagging CD:" + me.word());
            }
            if (TIME_PATTERN.matcher(me.word()).matches()) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
            } else if (TIME_PATTERN2.matcher(me.word()).matches()) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
            } else if (DATE_PATTERN.matcher(me.word()).matches()) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            } else if (DATE_PATTERN2.matcher(me.word()).matches()) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            } else if (next.get(CoreAnnotations.TextAnnotation.class) != null && me.get(CoreAnnotations.TextAnnotation.class) != null && DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches() && MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
                // deterministically make DATE for British-style number before month
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            } else if (prev.get(CoreAnnotations.TextAnnotation.class) != null && MONTH_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() && me.get(CoreAnnotations.TextAnnotation.class) != null && DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches()) {
                // deterministically make DATE for number after month
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            } else if (rightScanFindsMoneyWord(pl, i) && !leftScanFindsWeightWord(pl, i)) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
            } else if (ARMY_TIME_MORNING.matcher(me.word()).matches()) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
            } else if (YEAR_PATTERN.matcher(me.word()).matches() && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") && (MONTH_PATTERN.matcher(prev.word()).matches() || pl.get(i - 2).getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE"))) {
                // A year following an already-tagged date; getString keeps the
                // i - 2 lookup null-safe when that neighbour has no answer set.
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            } else {
                if (DEBUG) {
                    log.info("Found number:" + me.word());
                }
                if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
                    me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
                } else {
                    me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
                }
            }
        } else if (AM_PM.matcher(me.word()).matches() && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("TIME")) {
            // AM/PM marker directly after a TIME token; getString keeps this
            // null-safe when the previous element has no answer set.
            me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
        } else if (pos.equals(",") && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") && next.word() != null && YEAR_PATTERN.matcher(next.word()).matches()) {
            // Comma between a date and a following year
            me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        } else if (pos.equals("NNP") && MONTH_PATTERN.matcher(me.word()).matches()) {
            if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE") || next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            }
        } else if (pos.equals("CC")) {
            if (prev.tag() != null && prev.tag().equals("CD") && next.tag() != null && next.tag().equals("CD") && me.get(CoreAnnotations.TextAnnotation.class) != null && me.get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("and")) {
                if (DEBUG) {
                    log.info("Found number and:" + me.word());
                }
                // "and" between two cardinals is part of the number only after a magnitude word
                String wd = prev.word();
                if (wd.equalsIgnoreCase("hundred") || wd.equalsIgnoreCase("thousand") || wd.equalsIgnoreCase("million") || wd.equalsIgnoreCase("billion") || wd.equalsIgnoreCase("trillion")) {
                    me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
                }
            }
        } else if (pos.equals("NN") || pos.equals("NNS")) {
            if (CURRENCY_WORD_PATTERN.matcher(me.word()).matches()) {
                if (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD") && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
                    me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
                }
            } else if (me.word().equals("m") || me.word().equals("b")) {
                // A bare "m" or "b" continues a MONEY span, otherwise it is tagged NUMBER
                // (presumably abbreviations for million / billion -- confirm against upstream)
                if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
                    me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
                } else {
                    me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
                }
            } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
                if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) || (next.word() != null && next.word().equalsIgnoreCase("of") && next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches())) {
                    me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
                }
            } else if (GENERIC_TIME_WORDS.matcher(me.word()).matches()) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
            }
        } else if (pos.equals("JJ")) {
            if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) || (next.word() != null && next.word().equalsIgnoreCase("of") && next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches())) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
                // don't do other tags: don't want 'second' as noun, or 'first' as adverb
                // introducing reasons
                me.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL");
            }
        } else if (pos.equals("IN") && me.word().equalsIgnoreCase("of")) {
            // "of" between an ordinal and a month, e.g. "<ordinal> of <month>"
            if (prev.get(CoreAnnotations.TextAnnotation.class) != null && ORDINAL_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches() && next.get(CoreAnnotations.TextAnnotation.class) != null && MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
                me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
            }
        }
    }
    return document;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) PaddedList(edu.stanford.nlp.util.PaddedList)

Example 4 with PaddedList

use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.

the class ChineseNumberSequenceClassifier method classify.

/**
 * Assigns NER tags to the tokens of {@code document} with a fixed sequence of
 * heuristic rules over each token's word and part-of-speech tag and those of
 * its immediate neighbours.
 *
 * <p>Every token's {@code AnswerAnnotation} is first reset to
 * {@code flags.backgroundSymbol}; the first rule that matches, if any, then
 * overwrites it.
 *
 * @param document the tokens to tag; their answer annotations are overwritten
 * @return the same {@code document} list that was passed in
 */
@Override
public List<CoreLabel> classify(List<CoreLabel> document) {
    // Padding lets the rules look one position past either end of the document.
    PaddedList<CoreLabel> padded = new PaddedList<>(document, pad);
    final int tokenCount = padded.size();
    for (int idx = 0; idx < tokenCount; idx++) {
        CoreLabel current = padded.get(idx);
        CoreLabel previous = padded.get(idx - 1);
        CoreLabel following = padded.get(idx + 1);

        // Default answer: the background symbol.
        current.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
        String posTag = current.getString(CoreAnnotations.PartOfSpeechAnnotation.class);

        // Rule 1: an OD token is an ordinal.
        if (posTag.equals("OD")) {
            current.set(CoreAnnotations.AnswerAnnotation.class, ORDINAL_TAG);
            continue;
        }

        String word = current.word();

        // Rule 2: a currency word right after a CD token is money.
        if (CURRENCY_WORD_PATTERN.matcher(word).matches() && previous.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
            current.set(CoreAnnotations.AnswerAnnotation.class, MONEY_TAG);
            continue;
        }

        // Rule 3: a CD token is a percent, money, a date or a plain number.
        if (posTag.equals("CD")) {
            String numericTag;
            if (PERCENT_WORD_PATTERN1.matcher(word).matches() || PERCENT_WORD_PATTERN2.matcher(word).matches()) {
                numericTag = PERCENT_TAG;
            } else if (rightScanFindsMoneyWord(padded, idx)) {
                // A currency word was found scanning to the right.
                numericTag = MONEY_TAG;
            } else if (word.length() == 2 && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(word).matches() && DATE_AGE_LOCALIZER.equals(following.word())) {
                // Special case of DATE: 70 后 or 七零 后
                numericTag = DATE_TAG;
            } else {
                // Nothing more specific applies.
                numericTag = NUMBER_TAG;
            }
            current.set(CoreAnnotations.AnswerAnnotation.class, numericTag);
            continue;
        }

        // Rule 4: an NT (temporal noun) token is a date or a time.
        if (posTag.equals("NT")) {
            boolean looksLikeDate = DATE_PATTERN1.matcher(word).matches()
                    || DATE_PATTERN2.matcher(word).matches()
                    || DATE_PATTERN3.matcher(word).matches()
                    || DATE_PATTERN4.matcher(word).matches()
                    || DATE_PATTERN5.matcher(word).matches()
                    || DATE_WORDS.contains(word);
            if (looksLikeDate) {
                current.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
            } else if (TIME_PATTERN1.matcher(word).matches() || TIME_WORDS.contains(word)) {
                current.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG);
            } else {
                // TIME may have more variants (really?) so always add as TIME by default
                current.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG);
            }
            continue;
        }

        // Rule 5: label 后 as DATE when the sequence is 70 后 or 七零 后.
        String previousWord = previous.word();
        if (DATE_AGE_LOCALIZER.equals(word) && previousWord != null && previousWord.length() == 2 && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(previousWord).matches()) {
            current.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
        }
    }
    return document;
}
Also used : CoreLabel(edu.stanford.nlp.ling.CoreLabel) CoreAnnotations(edu.stanford.nlp.ling.CoreAnnotations) PaddedList(edu.stanford.nlp.util.PaddedList)

Aggregations

PaddedList (edu.stanford.nlp.util.PaddedList)4 CoreAnnotations (edu.stanford.nlp.ling.CoreAnnotations)3 CoreLabel (edu.stanford.nlp.ling.CoreLabel)3 Clique (edu.stanford.nlp.sequences.Clique)1 SeqClassifierFlags (edu.stanford.nlp.sequences.SeqClassifierFlags)1 Properties (java.util.Properties)1