Usage of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp:
class TwoLayerPredictionAggregationFeatures, method setLevel1AggregationFeatures.
/*
 * If our confidence in predicting the named entity is higher than minConfidenceThreshold, we're
 * going to use the predictions as features
 */
/*
 * Populates word's level-1 aggregation features with normalized counts of evidence collected
 * from up to 1000 tokens on each side of the word (crossing sentence boundaries). Two kinds of
 * evidence are counted: (1) the level-1 labels of other occurrences of the same token form, and
 * (2) direction-sensitive substring-match features against confident named entities nearby that
 * differ from the entity covering this word. When useGoldData is true (training time), gold
 * labels stand in for level-1 predictions, but they are randomly omitted (omissionRate) and
 * corrupted (noiseRate) so the level-2 model does not learn to over-trust level-1 input.
 */
private static void setLevel1AggregationFeatures(NEWord word, boolean useGoldData) {
ParametersForLbjCode parameters = word.params;
// this used to be hard-coded to 0.1
double omissionRate = parameters.omissionRate;
// this used to be hard-coded to 0.2 for right direction and 0.1 for left
// now this is approximated by halving the rate set in the properties
double noiseRate = parameters.randomNoiseLevel;
String wordForm = word.form;
String wordFormLC = wordForm.toLowerCase();
word.resetLevel1AggregationFeatures();
// the entity covering the current word: prediction by default, gold when training
NamedEntity currentNE = word.predictedEntity;
// these counters will keep the distribution of the features around the current word
OccurrenceCounter featuresCounts = new OccurrenceCounter();
if (useGoldData)
currentNE = word.goldEntity;
// entities (other than currentNE) seen in the surrounding window, plus per-direction views
HashMap<NamedEntity, Boolean> confidentEntitiesInTheArea = new HashMap<>();
HashMap<NamedEntity, Boolean> confidentEntitiesInTheAreaLeft = new HashMap<>();
HashMap<NamedEntity, Boolean> confidentEntitiesInTheAreaRight = new HashMap<>();
// scan up to 1000 tokens to the LEFT of the word
NEWord w = word.previousIgnoreSentenceBoundary;
for (int i = 0; i < 1000 && w != null; i++) {
// record nearby entities that are not the entity covering the current word
if (useGoldData && w.goldEntity != null && (!w.goldEntity.equals(currentNE))) {
confidentEntitiesInTheArea.put(w.goldEntity, true);
confidentEntitiesInTheAreaLeft.put(w.goldEntity, true);
}
if (w.predictedEntity != null && (!w.predictedEntity.equals(currentNE)) && !useGoldData) {
confidentEntitiesInTheArea.put(w.predictedEntity, true);
confidentEntitiesInTheAreaLeft.put(w.predictedEntity, true);
}
// another occurrence of the same token form: count its label as a feature
if (w != word && w.form.equals(wordForm)) {
if (useGoldData) {
// we're typically better with entities to the left....
// (hence only half the noise rate is applied on this side)
if (parameters.level1AggregationRandomGenerator.nextDouble() < (noiseRate / 2))
featuresCounts.addToken("leftTokenLevel" + parameters.level1AggregationRandomGenerator.randomLabel());
else
featuresCounts.addToken("leftTokenLevel" + w.neLabel);
} else {
featuresCounts.addToken("leftTokenLevel" + w.neTypeLevel1);
}
}
w = w.previousIgnoreSentenceBoundary;
}
// scan up to 1000 tokens to the RIGHT, mirroring the left scan
w = word.nextIgnoreSentenceBoundary;
for (int i = 0; i < 1000 && w != null; i++) {
if (useGoldData && w.goldEntity != null && (!w.goldEntity.equals(currentNE))) {
confidentEntitiesInTheArea.put(w.goldEntity, true);
confidentEntitiesInTheAreaRight.put(w.goldEntity, true);
}
if (w.predictedEntity != null && (!w.predictedEntity.equals(currentNE)) && !useGoldData) {
confidentEntitiesInTheArea.put(w.predictedEntity, true);
confidentEntitiesInTheAreaRight.put(w.predictedEntity, true);
}
if (w != word && w.form.equals(wordForm)) {
if (useGoldData) {
// full noise rate on the right side (left side used noiseRate / 2)
if (parameters.level1AggregationRandomGenerator.nextDouble() < noiseRate)
featuresCounts.addToken("rightTokenLevel" + parameters.level1AggregationRandomGenerator.randomLabel());
else
featuresCounts.addToken("rightTokenLevel" + w.neLabel);
} else {
featuresCounts.addToken("rightTokenLevel" + w.neTypeLevel1);
}
}
w = w.nextIgnoreSentenceBoundary;
}
// emit direction-sensitive substring-match features for each confident nearby entity
for (NamedEntity ne : confidentEntitiesInTheArea.keySet()) {
String neForm = ne.form;
String neFormLC = neForm.toLowerCase();
// check if we should just omit this NE
if (parameters.level1AggregationRandomGenerator.nextDouble() > omissionRate) {
// this is if the direction is right. If the direction is left- we have to modify
// this
String direction = Direction.RIGHT.toString();
// please be careful with updating the direction values
if (confidentEntitiesInTheAreaLeft.containsKey(ne)) {
direction = Direction.LEFT.toString();
// we're typically better with entities to the left....
// NOTE(review): this overwrites noiseRate with half the *omission* rate (not the
// noise rate) and is never reset, so it also affects subsequent iterations of
// this loop — confirm this is intended before touching it.
noiseRate = omissionRate / 2;
}
String neType = ne.type;
// with probability noiseRate, replace the entity type by a random non-"O",
// different type to inject label noise at training time
if (parameters.level1AggregationRandomGenerator.nextDouble() < noiseRate) {
String randomLabelType = parameters.level1AggregationRandomGenerator.randomType();
while (randomLabelType.equalsIgnoreCase("O") || randomLabelType.equals(neType)) randomLabelType = parameters.level1AggregationRandomGenerator.randomType();
neType = randomLabelType;
}
// sanity check: every entity in the area map must be in one of the direction maps
if ((!confidentEntitiesInTheAreaLeft.containsKey(ne)) && (!confidentEntitiesInTheAreaRight.containsKey(ne)))
throw new IllegalArgumentException("Fatal error: the NE is neither on the left or the right?!");
// case-sensitive and case-insensitive match relations of the word vs. the entity form
boolean neEqWord = neForm.equals(wordForm);
boolean neEqWordLC = neFormLC.equals(wordFormLC);
boolean neStartsWithWord = neForm.startsWith(wordForm);
boolean neStartsWithWordLC = neFormLC.startsWith(wordFormLC);
boolean neEndsWithWord = neForm.endsWith(wordForm);
boolean neEndsWithWordLC = neFormLC.endsWith(wordFormLC);
boolean neContainsWord = neForm.contains(wordForm);
if (currentNE != null) {
// the current word is itself inside an entity: compare that entity's full form
// against the nearby entity's form (only for forms longer than 3 characters)
String curNEForm = currentNE.form;
String curNEFormLC = curNEForm.toLowerCase();
if (curNEForm.length() > 3) {
boolean neEqCurNE = neForm.equals(curNEForm);
boolean neEqCurNELC = neFormLC.equals(curNEFormLC);
boolean neStartsWithCurNE = neForm.startsWith(curNEForm);
boolean neStartsWithCurNELC = neFormLC.startsWith(curNEFormLC);
boolean neEndsWithCurNE = neForm.endsWith(curNEForm);
boolean neEndsWithCurNELC = neFormLC.endsWith(curNEFormLC);
boolean neContainsCurNE = neForm.contains(curNEForm);
if (neEqCurNE)
featuresCounts.addToken(direction + "NE_Also_Exact_Match_NE_Type:\t" + neType);
if ((!neEqCurNE) && (!neStartsWithCurNE) && (!neEndsWithCurNE) && neContainsCurNE)
featuresCounts.addToken(direction + "NE_Also_Substring_In_NE_Type:\t" + neType);
if ((!neEqCurNE) && neStartsWithCurNE)
featuresCounts.addToken(direction + "NE_Also_Starts_NE_Type:\t" + neType);
if ((!neEqCurNE) && neEndsWithCurNE)
featuresCounts.addToken(direction + "NE_Also_Ends_NE_Type:\t" + neType);
// the "_IC" variants fire only when the case-insensitive relation holds but the
// corresponding case-sensitive feature did NOT fire
if ((!neEqCurNE) && neEqCurNELC)
featuresCounts.addToken(direction + "NE_Also_Exact_Match_NE_Type_IC:\t" + neType);
if ((!((!neEqCurNE) && (!neStartsWithCurNE) && (!neEndsWithCurNE) && neContainsCurNE)) && ((!neEqCurNELC) && (!neStartsWithCurNELC) && (!neEndsWithCurNELC) && neFormLC.contains(curNEFormLC)))
featuresCounts.addToken(direction + "NE_Also_Substring_In_NE_Type_IC:\t" + neType);
if ((!((!neEqCurNE) && neStartsWithCurNE)) && (!neEqCurNELC) && neStartsWithCurNELC)
featuresCounts.addToken(direction + "NE_Also_Starts_NE_Type_IC:\t" + neType);
if ((!((!neEqCurNE) && neEndsWithCurNE)) && (!neEqCurNELC) && neEndsWithCurNELC)
featuresCounts.addToken(direction + "NE_Also_Ends_NE_Type_IC:\t" + neType);
}
// able to say something about the word "Bank"
if (wordForm.length() > 3) {
if (neEqWord)
featuresCounts.addToken(direction + "labeledTokenExactMatchInExpression:\t" + neType);
if ((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)
featuresCounts.addToken(direction + "labeledTokenSubstringInExpression:\t" + neType);
if ((!neEqWord) && neStartsWithWord)
featuresCounts.addToken(direction + "labeledTokenStartsExpression:\t" + neType);
if ((!neEqWord) && neEndsWithWord)
// NOTE(review): the sibling features in this (labeled) branch all use the
// "labeledToken..." prefix; "unlabeledTokenEndsExpression" here looks like a
// copy-paste slip from the else-branch below — confirm before changing, since
// trained models depend on exact feature names.
featuresCounts.addToken(direction + "unlabeledTokenEndsExpression:\t" + neType);
if ((!neEqWord) && neEqWordLC)
featuresCounts.addToken(direction + "labeledTokenExactMatchInExpression_IC:\t" + neType);
if ((!((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)) && ((!neEqWordLC) && (!neStartsWithWordLC) && (!neEndsWithWordLC) && neFormLC.contains(wordFormLC)))
featuresCounts.addToken(direction + "labeledTokenSubstringInExpression_IC:\t" + neType);
if ((!((!neEqWord) && neStartsWithWord)) && ((!neEqWordLC) && neStartsWithWordLC))
featuresCounts.addToken(direction + "labeledTokenStartsExpression_IC:\t" + neType);
if ((!((!neEqWord) && neEndsWithWord)) && ((!neEqWordLC) && neEndsWithWordLC))
featuresCounts.addToken(direction + "labeledTokenEndsExpression_IC:\t" + neType);
}
} else {
// this form is not a part of named entity
if (wordForm.length() > 3) {
if (neEqWord)
featuresCounts.addToken(direction + "unlabeledTokenExactMatchInExpression:\t" + neType);
if ((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)
featuresCounts.addToken(direction + "unlabeledTokenSubstringInExpression:\t" + neType);
if ((!neEqWord) && neStartsWithWord)
featuresCounts.addToken(direction + "unlabeledTokenStartsExpression:\t" + neType);
if ((!neEqWord) && neEndsWithWord)
featuresCounts.addToken(direction + "unlabeledTokenEndsExpression:\t" + neType);
if ((!neEqWord) && neEqWordLC)
featuresCounts.addToken(direction + "unlabeledTokenExactMatchInExpression_IC:\t" + neType);
if ((!((!neEqWord) && (!neStartsWithWord) && (!neEndsWithWord) && neContainsWord)) && ((!neEqWordLC) && (!neStartsWithWordLC) && (!neEndsWithWordLC) && neFormLC.contains(wordFormLC)))
featuresCounts.addToken(direction + "unlabeledTokenSubstringInExpression_IC:\t" + neType);
if ((!((!neEqWord) && neStartsWithWord)) && ((!neEqWordLC) && neStartsWithWordLC))
featuresCounts.addToken(direction + "unlabeledTokenStartsExpression_IC:\t" + neType);
if ((!((!neEqWord) && neEndsWithWord)) && ((!neEqWordLC) && neEndsWithWordLC))
featuresCounts.addToken(direction + "unlabeledTokenEndsExpression_IC:\t" + neType);
}
}
}
}
// normalize each feature count by the maximum observed count, so strengths lie in (0, 1]
double max = -1;
for (Iterator<String> i = featuresCounts.getTokensIterator(); i.hasNext(); ) {
String s = i.next();
if (max < featuresCounts.getCount(s))
max = featuresCounts.getCount(s);
}
// NOTE(review): when no features were collected, max stays -1 (not 0), but the loop below
// then adds nothing, so the guard appears unreachable — confirm getCount's range.
if (max == 0)
max = 1;
ArrayList<NEWord.RealFeature> newag = word.resetLevel1AggregationFeatures();
for (Iterator<String> i = featuresCounts.getTokensIterator(); i.hasNext(); ) {
String s = i.next();
newag.add(new NEWord.RealFeature(featuresCounts.getCount(s) / max, s));
}
}
Usage of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp:
class Decoder, method annotateBIO_AllLevelsWithTaggers.
/**
 * Runs the full two-level NER inference pipeline over {@code data} and leaves BIO-encoded
 * predictions on every token. Level 1 is always run; level 2 runs only when a level-2 tagger
 * exists and the "PredictionsLevel1" feature family is enabled, otherwise level-1 output is
 * copied verbatim into the level-2 slots.
 * <p>
 * Use taggerLevel2=null if you want to use only one level of inference.
 */
protected static void annotateBIO_AllLevelsWithTaggers(Data data, ParametersForLbjCode params) throws Exception {
    clearPredictions(data);
    NETaggerLevel1.isTraining = false;
    NETaggerLevel2.isTraining = false;
    // level-1 pass: greedy decode, re-encode as BIO, then drop low-confidence spans
    GreedyDecoding.annotateGreedy(data, params.taggerLevel1, 1);
    TextChunkRepresentationManager.changeChunkRepresentation(params.taggingEncodingScheme, TextChunkRepresentationManager.EncodingScheme.BIO, data, NEWord.LabelToLookAt.PredictionLevel1Tagger);
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data, params.minConfidencePredictionsLevel1, NEWord.LabelToLookAt.PredictionLevel1Tagger);
    // Previously checked if features included 'PatternFeatures'
    boolean runLevel2 = params.taggerLevel2 != null && params.featuresToUse.containsKey("PredictionsLevel1");
    if (runLevel2) {
        // annotate with patterns: rebuild level-1 aggregation features (threshold 0.0 keeps
        // everything), decode level 2, then prune and re-encode as BIO
        PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data, 0.0, NEWord.LabelToLookAt.PredictionLevel1Tagger);
        TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(data, false);
        GreedyDecoding.annotateGreedy(data, params.taggerLevel2, 2);
        PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data, params.minConfidencePredictionsLevel2, NEWord.LabelToLookAt.PredictionLevel2Tagger);
        TextChunkRepresentationManager.changeChunkRepresentation(params.taggingEncodingScheme, TextChunkRepresentationManager.EncodingScheme.BIO, data, NEWord.LabelToLookAt.PredictionLevel2Tagger);
    } else {
        // no level-2 inference: mirror level-1 labels into the level-2 fields
        for (int doc = 0; doc < data.documents.size(); doc++) {
            for (LinkedVector sentence : data.documents.get(doc).sentences) {
                for (int t = 0; t < sentence.size(); t++) {
                    NEWord token = (NEWord) sentence.get(t);
                    token.neTypeLevel2 = token.neTypeLevel1;
                }
            }
        }
    }
}
Usage of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp:
class ContextAggregation, method annotate.
/*
 * Make sure to call this function as a last possible function: this function already assumes
 * that the data was annotated with dictionaries etc.
 */
/*
 * Adds aggregated-context count features to `word` by scanning a window of roughly 200 tokens
 * on each side (ignoring sentence boundaries) for other capitalized or down-cased occurrences
 * of the same form, and, for each capitalized co-occurrence, counting the surrounding +/-2
 * token context (forms and Brown-cluster prefixes).
 */
public static void annotate(NEWord word) {
if (word.params.featuresToUse.containsKey("aggregateContext") || word.params.featuresToUse.containsKey("aggregateGazetteerMatches")) {
int i = 0;
// w will be rewound to the start of the window; last marks one-past-the-end
NEWord w = word, last = word.nextIgnoreSentenceBoundary;
// NOTE(review): takenWords is populated with word +/- 3 tokens but never read again in
// this method — it appears to be dead bookkeeping here; confirm before removing.
Hashtable<NEWord, Boolean> takenWords = new Hashtable<>();
takenWords.put(word, true);
NEWord temp = word.nextIgnoreSentenceBoundary;
int k = 0;
while (temp != null && k < 3) {
takenWords.put(temp, true);
temp = temp.nextIgnoreSentenceBoundary;
k++;
}
temp = word.previousIgnoreSentenceBoundary;
k = 0;
while (temp != null && k < 3) {
takenWords.put(temp, true);
temp = temp.previousIgnoreSentenceBoundary;
k++;
}
// advance `last` up to 200 tokens forward and rewind `w` up to 200 tokens back;
// if the document ends early, `last` becomes null and the do-while below stops at null
for (i = 0; i < 200 && last != null; ++i) last = last.nextIgnoreSentenceBoundary;
for (i = 0; i > -200 && w.previousIgnoreSentenceBoundary != null; --i) w = w.previousIgnoreSentenceBoundary;
do {
// same form seen down-cased elsewhere while this occurrence is capitalized
if (w.form.equalsIgnoreCase(word.form) && Character.isUpperCase(word.form.charAt(0)) && Character.isLowerCase(w.form.charAt(0)))
updateFeatureCounts(word, "appearsDownCased");
// a distinct, capitalized co-occurrence of the same form
if (w.form.equalsIgnoreCase(word.form) && Character.isUpperCase(w.form.charAt(0)) && Character.isUpperCase(word.form.charAt(0)) && word != w) {
if (word.params.featuresToUse.containsKey("aggregateContext")) {
// distinguish sentence-initial capitalization (weaker evidence) from
// mid-sentence capitalization (stronger evidence of a name)
if (w.previous == null)
updateFeatureCounts(word, "appearancesUpperStartSentence");
if (w.previous != null)
if (((NEWord) w.previous).form.endsWith("."))
updateFeatureCounts(word, "appearancesUpperStartSentence");
if (w.previous != null && (!((NEWord) w.previous).form.endsWith(".")))
updateFeatureCounts(word, "appearancesUpperMiddleSentence");
// collect the +/-2 token context around the co-occurrence; j tracks the
// signed offset of wtemp relative to w
NEWord wtemp = w, lastTemp = w.nextIgnoreSentenceBoundary;
int j = 0;
for (j = 0; j < 2 && lastTemp != null; ++j) lastTemp = lastTemp.nextIgnoreSentenceBoundary;
for (j = 0; j > -2 && wtemp.previousIgnoreSentenceBoundary != null; --j) wtemp = wtemp.previousIgnoreSentenceBoundary;
do {
updateFeatureCounts(word, "context:" + j + ":" + wtemp.form);
if (word.params.brownClusters.getResources() != null) {
String[] brownPaths = word.params.brownClusters.getPrefixes(wtemp);
// updateFeatureCounts(word,"contextPath:"+j+":"+brownPaths[k]);
if (brownPaths.length > 0)
updateFeatureCounts(word, "contextPath:" + j + ":" + brownPaths[0]);
}
wtemp = wtemp.nextIgnoreSentenceBoundary;
j++;
} while (wtemp != lastTemp);
}
}
w = w.nextIgnoreSentenceBoundary;
} while (w != last);
}
}
Usage of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp:
class NERAnnotator, method addView.
/**
 * Generate the view representing the list of extracted entities and adds it the
 * {@link TextAnnotation}.
 */
@Override
public void addView(TextAnnotation ta) {
// convert this data structure into one the NER package can deal with.
ArrayList<LinkedVector> sentences = new ArrayList<>();
String[] tokens = ta.getTokens();
// maps NER-side token positions back to TextAnnotation token indices; the two can diverge
// because zero-length tokens are skipped on the NER side but still advance tokenIndex
int[] tokenindices = new int[tokens.length];
int tokenIndex = 0;
int neWordIndex = 0;
for (int i = 0; i < ta.getNumberOfSentences(); i++) {
Sentence sentence = ta.getSentence(i);
String[] wtoks = sentence.getTokens();
LinkedVector words = new LinkedVector();
for (String w : wtoks) {
if (w.length() > 0) {
NEWord.addTokenToSentence(words, w, "unlabeled", this.params);
tokenindices[neWordIndex] = tokenIndex;
neWordIndex++;
} else {
logger.error("Bad (zero length) token.");
}
tokenIndex++;
}
if (words.size() > 0)
sentences.add(words);
}
// Do the annotation.
Data data = new Data(new NERDocument(sentences, "input"));
try {
ExpressiveFeaturesAnnotator.annotate(data, this.params);
Decoder.annotateDataBIO(data, params);
} catch (Exception e) {
logger.error("Cannot annotate the text, the exception was: ", e);
return;
}
// now we have the parsed entities, construct the view object.
ArrayList<LinkedVector> nerSentences = data.documents.get(0).sentences;
SpanLabelView nerView = new SpanLabelView(getViewName(), ta);
// the data always has a single document
// each LinkedVector in data corresponds to a sentence.
// tokenoffset is the running NER-side token position across all sentences
int tokenoffset = 0;
for (LinkedVector vector : nerSentences) {
// open == true while we are inside an entity span that has not been closed yet
boolean open = false;
// there should be a 1:1 mapping btw sentence tokens in record and words/predictions
// from NER.
int startIndex = -1;
String label = null;
for (int j = 0; j < vector.size(); j++, tokenoffset++) {
NEWord neWord = (NEWord) (vector.get(j));
// level-2 BIO tag for this token, e.g. "B-PER", "I-LOC", or "O"
String prediction = neWord.neTypeLevel2;
// inefficient, use enums, or nominalized indexes for this sort of thing.
if (prediction.startsWith("B-")) {
startIndex = tokenoffset;
label = prediction.substring(2);
open = true;
} else if (j > 0) {
// robustness: also open a span on an I- tag whose predecessor carries a
// different type (malformed BIO sequences without a leading B-)
String previous_prediction = ((NEWord) vector.get(j - 1)).neTypeLevel2;
if (prediction.startsWith("I-") && (!previous_prediction.endsWith(prediction.substring(2)))) {
startIndex = tokenoffset;
label = prediction.substring(2);
open = true;
}
}
if (open) {
// a span closes at the last token of the sentence, or just before a B-/O tag,
// or just before an I-/B- tag of a different type
boolean close = false;
if (j == vector.size() - 1) {
close = true;
} else {
String next_prediction = ((NEWord) vector.get(j + 1)).neTypeLevel2;
if (next_prediction.startsWith("B-"))
close = true;
if (next_prediction.equals("O"))
close = true;
if (next_prediction.indexOf('-') > -1 && (!prediction.endsWith(next_prediction.substring(2))))
close = true;
}
if (close) {
int s = tokenindices[startIndex];
/*
 * MS: fixed bug. Originally, e was set using tokenindices[tokenoffset], but
 * tokenoffset can reach tokens.length) and this exceeds array length.
 * Constituent constructor requires one-past-the-end token indexing,
 * requiring e > s. Hence the complicated setting of endIndex/e below.
 */
int endIndex = Math.min(tokenoffset + 1, tokens.length - 1);
int e = tokenindices[endIndex];
if (e <= s)
e = s + 1;
nerView.addSpanLabel(s, e, label, 1d);
open = false;
}
}
}
}
ta.addView(viewName, nerView);
}
Usage of edu.illinois.cs.cogcomp.ner.LbjTagger.NEWord in project cogcomp-nlp by CogComp:
class PlainTextReader, method showSentenceVector.
/**
 * Renders a sentence vector for display by concatenating the {@code toString()} output of
 * every token in every sentence, in order.
 *
 * @param sentences sentences, each a {@link LinkedVector} of {@link NEWord} tokens
 * @return the concatenation of all tokens' string forms ("" for an empty vector)
 */
public static String showSentenceVector(Vector<LinkedVector> sentences) {
    // StringBuilder avoids the O(n^2) cost of repeated String += concatenation
    StringBuilder display = new StringBuilder();
    for (LinkedVector v : sentences) {
        for (int i = 0; i < v.size(); ++i) {
            NEWord s = (NEWord) (v.get(i));
            display.append(s.toString());
        }
    }
    return display.toString();
}
Aggregations