use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.
the class NERFeatureFactoryITest method testSloppyGazette.
/**
 * Checks the gazette features produced by {@code NERFeatureFactory.featuresC}
 * when both {@code useGazettes} and {@code sloppyGazette} are enabled, at three
 * consecutive positions ("John", "Bauer", "has") of a fixed sentence.
 */
public void testSloppyGazette() {
  // Tokenise the fixture sentence on runs of spaces and pad it with empty labels.
  String[] words = "For three years , John Bauer has worked at Stanford .".split(" +");
  List<CoreLabel> tokens = SentenceUtils.toCoreLabelList(words);
  PaddedList<CoreLabel> padded = new PaddedList<>(tokens, new CoreLabel());

  // Configure a feature factory that reads the test gazette in sloppy mode.
  Properties gazetteProps = new Properties();
  gazetteProps.setProperty("useGazettes", "true");
  gazetteProps.setProperty("sloppyGazette", "true");
  gazetteProps.setProperty("gazette", "projects/core/data/edu/stanford/nlp/ie/test_gazette.txt");
  NERFeatureFactory<CoreLabel> featureFactory = new NERFeatureFactory<>();
  featureFactory.init(new SeqClassifierFlags(gazetteProps));

  // Position 4: "John"
  Set<String> johnFeatures = new HashSet<>(featureFactory.featuresC(padded, 4));
  checkFeatures(johnFeatures, "BAR-GAZ", "BAZ-GAZ", "FOO-GAZ", "BAR-GAZ2", "BAZ-GAZ2", "FOO-GAZ1", "John-WORD");

  // Position 5: "Bauer"
  Set<String> bauerFeatures = new HashSet<>(featureFactory.featuresC(padded, 5));
  checkFeatures(bauerFeatures, "BAR-GAZ", "BAZ-GAZ", "BAR-GAZ2", "BAZ-GAZ2", "Bauer-WORD");

  // Position 6: "has"
  Set<String> hasFeatures = new HashSet<>(featureFactory.featuresC(padded, 6));
  checkFeatures(hasFeatures, "has-WORD");
}
use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.
the class CRFBiasedClassifier method makeDatum.
/**
 * Builds the CRF datum for position {@code loc} of {@code info}.
 *
 * <p>For every window size below {@code windowSize} the features of all cliques
 * not already handled at a smaller window are collected from each feature
 * factory. At test time the {@code BIAS} feature is appended for each clique of
 * window 0. The labels are the class indices of the answers of the
 * {@code windowSize} positions ending at {@code loc}.
 *
 * @param info the token sequence; it is wrapped in a {@code PaddedList} using {@code pad}
 * @param loc the position in {@code info} the datum is built for
 * @param featureFactories the factories queried, in order, for clique features
 * @return a datum holding the per-window feature lists and the label window
 */
@Override
public CRFDatum<List<String>, CRFLabel> makeDatum(List<IN> info, int loc, List<FeatureFactory<IN>> featureFactories) {
  // The pad element gets the background symbol as its answer before it is used for padding.
  pad.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
  PaddedList<IN> paddedInfo = new PaddedList<>(info, pad);

  List<List<String>> featuresByWindow = new ArrayList<>();
  Collection<Clique> seenCliques = Generics.newHashSet();
  for (int window = 0; window < windowSize; window++) {
    // Keep only the cliques that no smaller window has produced yet.
    List<Clique> freshCliques = FeatureFactory.getCliques(window, 0);
    freshCliques.removeAll(seenCliques);
    seenCliques.addAll(freshCliques);

    // The bias feature is only present at test time and only appears
    // in cliques of size 1 (i.e., cliques with window=0).
    final boolean addBias = testTime && window == 0;

    List<String> windowFeatures = new ArrayList<>();
    for (Clique clique : freshCliques) {
      for (FeatureFactory<IN> factory : featureFactories) {
        windowFeatures.addAll(factory.getCliqueFeatures(paddedInfo, loc, clique));
      }
      if (addBias) {
        windowFeatures.add(BIAS);
      }
    }
    featuresByWindow.add(windowFeatures);
  }

  // Label window: positions loc - windowSize + 1 .. loc, oldest first.
  int[] labelWindow = new int[windowSize];
  final int windowStart = loc - windowSize + 1;
  for (int offset = 0; offset < windowSize; offset++) {
    String answer = paddedInfo.get(windowStart + offset).get(CoreAnnotations.AnswerAnnotation.class);
    labelWindow[offset] = classIndex.indexOf(answer);
  }
  return new CRFDatum<>(featuresByWindow, new CRFLabel(labelWindow), null);
}
use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.
the class NumberSequenceClassifier method classifyOld.
/**
 * Rule-based tagger that writes an {@code AnswerAnnotation} onto every token of
 * {@code document}, using the token's part-of-speech tag, its text, and its
 * immediate neighbours.
 *
 * <p>Each token is first reset to {@code flags.backgroundSymbol}; the first
 * matching rule may then overwrite it with one of {@code "MONEY"},
 * {@code "TIME"}, {@code "DATE"}, {@code "NUMBER"} or {@code "ORDINAL"}.
 * Tokens are processed left to right, so rules that inspect the answer of a
 * previous token see the label assigned earlier in this same pass.
 *
 * <p>All answer-label comparisons on neighbouring tokens go through
 * {@code getString}, so a neighbour (or pad element) without an
 * {@code AnswerAnnotation} is treated as "no match" instead of raising a
 * {@code NullPointerException}.
 *
 * @param document the tokens to label; they are modified in place
 * @return the same {@code document} list that was passed in
 */
private List<CoreLabel> classifyOld(List<CoreLabel> document) {
  // Neighbour lookups below go out of range at the sequence edges (i - 2 .. i + 2);
  // presumably PaddedList answers those with `pad` - confirm against PaddedList.
  PaddedList<CoreLabel> pl = new PaddedList<>(document, pad);
  for (int i = 0, sz = pl.size(); i < sz; i++) {
    CoreLabel me = pl.get(i);
    CoreLabel prev = pl.get(i - 1);
    CoreLabel next = pl.get(i + 1);
    CoreLabel next2 = pl.get(i + 2);
    // Default label; overwritten below when a rule fires.
    me.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);
    if (CURRENCY_SYMBOL_PATTERN.matcher(me.word()).matches()
        && (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")
            || next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD"))) {
      // A currency symbol directly next to a cardinal number (CD) is MONEY.
      if (DEBUG) {
        log.info("Found currency sign:" + me.word());
      }
      me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
    } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
      // Cardinal number: try the specific readings first, NUMBER/MONEY last.
      if (DEBUG) {
        log.info("Tagging CD:" + me.word());
      }
      if (TIME_PATTERN.matcher(me.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
      } else if (TIME_PATTERN2.matcher(me.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
      } else if (DATE_PATTERN.matcher(me.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      } else if (DATE_PATTERN2.matcher(me.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      } else if (next.get(CoreAnnotations.TextAnnotation.class) != null
          && me.get(CoreAnnotations.TextAnnotation.class) != null
          && DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches()
          && MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
        // deterministically make DATE for British-style number before month
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      } else if (prev.get(CoreAnnotations.TextAnnotation.class) != null
          && MONTH_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches()
          && me.get(CoreAnnotations.TextAnnotation.class) != null
          && DAY_PATTERN.matcher(me.get(CoreAnnotations.TextAnnotation.class)).matches()) {
        // deterministically make DATE for number after month
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      } else if (rightScanFindsMoneyWord(pl, i) && !leftScanFindsWeightWord(pl, i)) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
      } else if (ARMY_TIME_MORNING.matcher(me.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
      } else if (YEAR_PATTERN.matcher(me.word()).matches()
          && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE")
          && (MONTH_PATTERN.matcher(prev.word()).matches()
              // getString (not get): the element at i - 2 may carry no answer yet.
              || pl.get(i - 2).getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE"))) {
        // A year that continues an already-tagged DATE.
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      } else {
        if (DEBUG) {
          log.info("Found number:" + me.word());
        }
        // A plain number continues a preceding MONEY span, otherwise it is a NUMBER.
        if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
        } else {
          me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
        }
      }
    } else if (AM_PM.matcher(me.word()).matches()
        // getString (not get): prev may be an element without an answer annotation.
        && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("TIME")) {
      me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
    } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null
        && me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",")
        && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE")
        && next.word() != null
        && YEAR_PATTERN.matcher(next.word()).matches()) {
      // Comma between a DATE and a following year belongs to the DATE.
      me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
    } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNP")
        && MONTH_PATTERN.matcher(me.word()).matches()) {
      // Month name: only a DATE when it follows a DATE or precedes a number.
      if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("DATE")
          || next.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      }
    } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null
        && me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC")) {
      // "and" between two cardinals, after a magnitude word, is part of the NUMBER.
      if (prev.tag() != null && prev.tag().equals("CD")
          && next.tag() != null && next.tag().equals("CD")
          && me.get(CoreAnnotations.TextAnnotation.class) != null
          && me.get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("and")) {
        if (DEBUG) {
          log.info("Found number and:" + me.word());
        }
        String wd = prev.word();
        if (wd.equalsIgnoreCase("hundred") || wd.equalsIgnoreCase("thousand") || wd.equalsIgnoreCase("million") || wd.equalsIgnoreCase("billion") || wd.equalsIgnoreCase("trillion")) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
        }
      }
    } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class) != null
        && (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NN")
            || me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("NNS"))) {
      if (CURRENCY_WORD_PATTERN.matcher(me.word()).matches()) {
        // Currency word is MONEY only when it follows a cardinal already tagged MONEY.
        if (prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")
            && prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
        }
      } else if (me.word().equals("m") || me.word().equals("b")) {
        // NOTE(review): "m"/"b" are presumably read as million/billion abbreviations - confirm.
        if (prev.getString(CoreAnnotations.AnswerAnnotation.class).equals("MONEY")) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
        } else {
          me.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
        }
      } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
        // Ordinal followed by a month, or by "of" + month, is a DATE.
        if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) || (next.word() != null && next.word().equalsIgnoreCase("of") && next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches())) {
          me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
        }
      } else if (GENERIC_TIME_WORDS.matcher(me.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "TIME");
      }
    } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ")) {
      if ((next.word() != null && MONTH_PATTERN.matcher(next.word()).matches()) || next.word() != null && next.word().equalsIgnoreCase("of") && next2.word() != null && MONTH_PATTERN.matcher(next2.word()).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      } else if (ORDINAL_PATTERN.matcher(me.word()).matches()) {
        // don't do other tags: don't want 'second' as noun, or 'first' as adverb
        // introducing reasons
        me.set(CoreAnnotations.AnswerAnnotation.class, "ORDINAL");
      }
    } else if (me.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("IN") && me.word().equalsIgnoreCase("of")) {
      // "of" between an ordinal and a month is part of the DATE.
      if (prev.get(CoreAnnotations.TextAnnotation.class) != null
          && ORDINAL_PATTERN.matcher(prev.get(CoreAnnotations.TextAnnotation.class)).matches()
          && next.get(CoreAnnotations.TextAnnotation.class) != null
          && MONTH_PATTERN.matcher(next.get(CoreAnnotations.TextAnnotation.class)).matches()) {
        me.set(CoreAnnotations.AnswerAnnotation.class, "DATE");
      }
    }
  }
  return document;
}
use of edu.stanford.nlp.util.PaddedList in project CoreNLP by stanfordnlp.
the class ChineseNumberSequenceClassifier method classify.
/**
 * Assigns an NER tag to every token of {@code document} with a fixed sequence
 * of heuristic rules based on the token's part-of-speech tag, its text, and the
 * tokens immediately before and after it.
 *
 * <p>Every token is first labelled with {@code flags.backgroundSymbol}; the
 * first rule that matches then replaces that label.
 *
 * @param document the tokens to label; their {@code AnswerAnnotation} is set in place
 * @return the same {@code document} list that was passed in
 */
@Override
public List<CoreLabel> classify(List<CoreLabel> document) {
  PaddedList<CoreLabel> padded = new PaddedList<>(document, pad);
  final int tokenCount = padded.size();
  for (int idx = 0; idx < tokenCount; idx++) {
    CoreLabel token = padded.get(idx);
    CoreLabel before = padded.get(idx - 1);
    CoreLabel after = padded.get(idx + 1);

    // Background label ("O") unless one of the rules below fires.
    token.set(CoreAnnotations.AnswerAnnotation.class, flags.backgroundSymbol);

    final String pos = token.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
    final String word = token.word();

    // Rule 1: an OD part-of-speech tag is an ORDINAL.
    if (pos.equals("OD")) {
      token.set(CoreAnnotations.AnswerAnnotation.class, ORDINAL_TAG);
      continue;
    }

    // Rule 2: a currency word directly after a CD is MONEY.
    if (CURRENCY_WORD_PATTERN.matcher(word).matches()
        && before.getString(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CD")) {
      token.set(CoreAnnotations.AnswerAnnotation.class, MONEY_TAG);
      continue;
    }

    // Rule 3: a CD is PERCENT, MONEY, DATE or (by default) NUMBER.
    if (pos.equals("CD")) {
      final String numericTag;
      if (PERCENT_WORD_PATTERN1.matcher(word).matches() || PERCENT_WORD_PATTERN2.matcher(word).matches()) {
        numericTag = PERCENT_TAG;
      } else if (rightScanFindsMoneyWord(padded, idx)) {
        // a currency word was found scanning to the right
        numericTag = MONEY_TAG;
      } else if (word.length() == 2
          && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(word).matches()
          && DATE_AGE_LOCALIZER.equals(after.word())) {
        // special case of DATE: 70 后 or 七零 后
        numericTag = DATE_TAG;
      } else {
        numericTag = NUMBER_TAG;
      }
      token.set(CoreAnnotations.AnswerAnnotation.class, numericTag);
      continue;
    }

    // Rule 4: an NT (temporal noun) is a DATE when it looks like one, else a TIME.
    if (pos.equals("NT")) {
      final boolean looksLikeDate = DATE_PATTERN1.matcher(word).matches()
          || DATE_PATTERN2.matcher(word).matches()
          || DATE_PATTERN3.matcher(word).matches()
          || DATE_PATTERN4.matcher(word).matches()
          || DATE_PATTERN5.matcher(word).matches()
          || DATE_WORDS.contains(word);
      if (looksLikeDate) {
        token.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
      } else if (TIME_PATTERN1.matcher(word).matches() || TIME_WORDS.contains(word)) {
        token.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG);
      } else {
        // anything else tagged NT is also treated as TIME by default
        token.set(CoreAnnotations.AnswerAnnotation.class, TIME_TAG);
      }
      continue;
    }

    // Rule 5: 后 after a two-character numeral (70 后 / 七零 后) is part of the DATE.
    final String previousWord = before.word();
    if (DATE_AGE_LOCALIZER.equals(word)
        && previousWord != null
        && previousWord.length() == 2
        && CHINESE_AND_ARABIC_NUMERALS_PATTERN.matcher(previousWord).matches()) {
      token.set(CoreAnnotations.AnswerAnnotation.class, DATE_TAG);
    }
  }
  return document;
}
Aggregations