Use of edu.illinois.cs.cogcomp.edison.features.Feature in the project cogcomp-nlp by CogComp: the class WordTypeInformation, method getFeatures.
/**
 * This feature extractor assumes that the TOKENS view has been generated in the constituent's
 * TextAnnotation. For each token in a [-2, +2] window around the constituent it generates three
 * discrete word-type features: all-capitalized, all-digits, and all-non-letters.
 *
 * @param c the constituent (a single word/token, per the LBJ chunk definition)
 * @return a set of discrete word-type features for the surrounding token window
 * @throws EdisonException if feature extraction fails
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    TextAnnotation ta = c.getTextAnnotation();
    TOKENS = ta.getView(ViewNames.TOKENS);
    // We can assume that the constituent in this case is a Word(Token) described by the LBJ
    // chunk definition.
    int startspan = c.getStartSpan();
    int endspan = c.getEndSpan();
    // All our constituents are words(tokens); take a window of 2 tokens on each side.
    String[] forms = getwindowkfrom(TOKENS, startspan, endspan, 2);
    String __id, __value;
    String classifier = "WordTypeInformation";
    Set<Feature> __result = new LinkedHashSet<Feature>();
    for (int i = 0; i < forms.length; i++) {
        if (forms[i] != null) {
            boolean allCapitalized = true, allDigits = true, allNonLetters = true;
            for (int j = 0; j < forms[i].length(); ++j) {
                char ch = forms[i].charAt(j);
                allCapitalized &= Character.isUpperCase(ch);
                allDigits &= Character.isDigit(ch);
                allNonLetters &= !Character.isLetter(ch);
            }
            // Id prefixes: "c" = capitalization, "d" = digits, "n" = non-letters.
            __id = classifier + ":" + ("c" + i);
            __value = "(" + (allCapitalized) + ")";
            __result.add(new DiscreteFeature(__id + __value));
            __id = classifier + ":" + ("d" + i);
            __value = "(" + (allDigits) + ")";
            __result.add(new DiscreteFeature(__id + __value));
            // BUG FIX: the non-letters feature previously reused the "c" prefix,
            // colliding with the allCapitalized feature id above. Use a distinct
            // "n" prefix. NOTE(review): this changes emitted feature names, so
            // models trained with the old (colliding) names must be retrained.
            __id = classifier + ":" + ("n" + i);
            __value = "(" + (allNonLetters) + ")";
            __result.add(new DiscreteFeature(__id + __value));
        }
    }
    return __result;
}
Use of edu.illinois.cs.cogcomp.edison.features.Feature in the project cogcomp-nlp by CogComp: the class Affixes, method getFeatures.
/**
 * Generates affix features from the surface form of the given constituent: prefixes of
 * length 3 and 4 ("p|" features) and suffixes of length 1 through 4 ("s|" features).
 * An affix is emitted only when the word is strictly longer than the affix.
 *
 * @param c the constituent whose surface form supplies the affixes
 * @return a set of discrete prefix/suffix features
 * @throws EdisonException if feature extraction fails
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    String classifier = "Affixes";
    TextAnnotation ta = c.getTextAnnotation();
    TOKENS = ta.getView(ViewNames.TOKENS);
    Set<Feature> result = new LinkedHashSet<Feature>();
    String word = c.getSurfaceForm();
    // Prefixes of length 3 and 4.
    for (int len = 3; len <= 4; ++len) {
        if (word.length() > len) {
            String affix = "" + (word.substring(0, len));
            result.add(new DiscreteFeature(classifier + ":" + "p|" + "(" + affix + ")"));
        }
    }
    // Suffixes of length 1 through 4.
    for (int len = 1; len <= 4; ++len) {
        if (word.length() > len) {
            String affix = "" + (word.substring(word.length() - len));
            result.add(new DiscreteFeature(classifier + ":" + "s|" + "(" + affix + ")"));
        }
    }
    return result;
}
Use of edu.illinois.cs.cogcomp.edison.features.Feature in the project cogcomp-nlp by CogComp: the class ChunkWindowThreeBefore, method getFeatures.
/**
 * This feature extractor assumes that the TOKENS view and the SHALLOW_PARSE view have been
 * generated in the constituent's TextAnnotation. It generates discrete features from the
 * chunk labels of the (up to) two tokens preceding the constituent.
 *
 * @param c the constituent (a single word/token, per the LBJ chunk definition)
 * @return discrete features built from the preceding chunk labels; empty if there are no
 *         preceding tokens
 * @throws EdisonException if feature extraction fails
 */
@Override
public Set<Feature> getFeatures(Constituent c) throws EdisonException {
    String classifier = "ChunkWindowThreeBefore";
    TextAnnotation ta = c.getTextAnnotation();
    TOKENS = ta.getView(ViewNames.TOKENS);
    SHALLOW_PARSE = ta.getView(ViewNames.SHALLOW_PARSE);
    // We can assume that the constituent in this case is a Word(Token) described by the LBJ
    // chunk definition.
    int startspan = c.getStartSpan();
    int endspan = c.getEndSpan();
    // Up to two words before the constituent (fewer near the sentence start).
    int k = -2;
    List<Constituent> wordstwobefore = getwordskfrom(TOKENS, startspan, endspan, k);
    Set<Feature> result = new LinkedHashSet<Feature>();
    if (wordstwobefore.isEmpty()) {
        return result;
    }
    int i = 0;
    for (Constituent token : wordstwobefore) {
        // There should be exactly one chunk label covering each token.
        List<String> chunkLabels =
                SHALLOW_PARSE.getLabelsCoveringSpan(token.getStartSpan(), token.getEndSpan());
        if (chunkLabels.isEmpty()) {
            // BUG FIX: get(0) was previously called unconditionally and threw
            // IndexOutOfBoundsException when no chunk label covered the token.
            // Skip the token but keep the positional index advancing so the
            // remaining feature ids stay aligned with token positions.
            logger.warn("Token has no chunk label; skipping.");
            i++;
            continue;
        }
        if (chunkLabels.size() != 1) {
            logger.warn("Error token has more than one POS tag or Chunk Label.");
        }
        String __value = "(" + chunkLabels.get(0) + ")";
        String __id = classifier + ":" + (i++);
        result.add(new DiscreteFeature(__id + __value));
    }
    return result;
}
Use of edu.illinois.cs.cogcomp.edison.features.Feature in the project cogcomp-nlp by CogComp: the class PreExtractor, method countFeatures.
/**
 * This is where actual feature extraction is taking place. The features are defined in the
 * <b>features.fex</b> file and are read by {@link FeatureExtractor}. Each extracted feature
 * is resolved against the shared lexicon (optionally growing it when {@code addNewFeatures}
 * is set), counted, and the resulting sparse vector is cached on the instance.
 *
 * @param x The predicate to extract features from.
 * @throws EdisonException if feature extraction fails
 */
public void countFeatures(SenseInstance x) throws EdisonException {
    ModelInfo modelInfo = manager.getModelInfo();
    Set<Feature> feats = modelInfo.fex.getFeatures(x.getConstituent());
    List<Integer> ids = new ArrayList<>();
    List<Float> values = new ArrayList<>();
    // The lexicon is shared; this is the only place where a new feature can be added to it.
    synchronized (lexicon) {
        for (Feature feature : feats) {
            String name = feature.getName();
            if (!lexicon.contains(name)) {
                if (!addNewFeatures) {
                    // Unknown feature and the lexicon may not grow: drop it.
                    continue;
                }
                lexicon.previewFeature(name);
            }
            int featureId = lexicon.lookupId(name);
            lexicon.countFeature(featureId);
            ids.add(featureId);
            values.add(feature.getValue());
        }
    }
    x.cacheFeatureVector(new FeatureVector(ArrayUtilities.asIntArray(ids), ArrayUtilities.asFloatArray(values)));
}
Use of edu.illinois.cs.cogcomp.edison.features.Feature in the project cogcomp-nlp by CogComp: the class WordBigrams, method getFeatures.
/**
 * Generates word unigram and bigram features over the tokens covered by the constituent,
 * ordered by token start position.
 *
 * @param instance the constituent whose covered tokens supply the n-grams
 * @return the unigram and bigram features
 * @throws EdisonException if feature extraction fails
 */
@Override
public Set<Feature> getFeatures(Constituent instance) throws EdisonException {
    View tokenView = instance.getTextAnnotation().getView(ViewNames.TOKENS);
    List<Constituent> covered =
            tokenView.getConstituentsCoveringSpan(instance.getStartSpan(), instance.getEndSpan());
    covered.sort(TextAnnotationUtilities.constituentStartComparator);
    // Project each token to its raw surface string for n-gram construction.
    ITransformer<Constituent, String> toSurface = new ITransformer<Constituent, String>() {
        public String transform(Constituent token) {
            return token.getSurfaceForm();
        }
    };
    Set<Feature> features = new LinkedHashSet<>();
    // Unigrams (n = 1) and bigrams (n = 2).
    for (int n = 1; n <= 2; n++) {
        features.addAll(FeatureNGramUtility.getNgramsOrdered(covered, n, toSurface));
    }
    return features;
}
Aggregations