Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class WordsToTokens, method convert.
/**
 * Given a {@link LinkedVector} containing {@link Word}s, this method
 * creates a new {@link LinkedVector} containing {@link Token}s.
 *
 * @param v A {@link LinkedVector} of {@link Word}s.
 * @return A {@link LinkedVector} of {@link Token}s corresponding to the
 *         input {@link Word}s.
 */
public static LinkedVector convert(LinkedVector v) {
    if (v == null)
        return null;
    if (v.size() == 0)
        return v;
    // Wrap the first Word in a Token, then walk the linked list, chaining a
    // new Token onto the previous one for each subsequent Word.
    Word w = (Word) v.get(0);
    Token t = new Token(w, null, null);
    for (w = (Word) w.next; w != null; w = (Word) w.next) {
        t.next = new Token(w, t, null);
        t = (Token) t.next;
    }
    // The LinkedVector constructor accepts any link in the chain.
    return new LinkedVector(t);
}
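For context, a minimal sketch of how the conversion might be exercised. This is hypothetical: the Word(String) constructor, the import paths, and the hand-built next/previous links are assumptions; in the real pipeline the input LinkedVector would come from LBJava's sentence splitter.

import edu.illinois.cs.cogcomp.lbjava.nlp.Word;
import edu.illinois.cs.cogcomp.lbjava.nlp.seg.Token;
import edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector;

public class ConvertSketch {
    public static void main(String[] args) {
        // Assumed: Word(String) constructor; normally a sentence splitter
        // would produce this LinkedVector of Words.
        Word first = new Word("Urbana");
        Word second = new Word("Illinois");
        first.next = second;
        second.previous = first;
        // As in convert() above, the LinkedVector constructor accepts any
        // element of the linked list.
        LinkedVector words = new LinkedVector(first);

        LinkedVector tokens = WordsToTokens.convert(words);
        for (int i = 0; i < tokens.size(); i++)
            System.out.println(((Token) tokens.get(i)).form);
    }
}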
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class Decoder, method annotateBIO_AllLevelsWithTaggers.
/**
 * Use taggerLevel2 = null to run only one level of inference.
 */
protected static void annotateBIO_AllLevelsWithTaggers(Data data, NETaggerLevel1 taggerLevel1,
        NETaggerLevel2 taggerLevel2) throws Exception {
    clearPredictions(data);
    NETaggerLevel1.isTraining = false;
    NETaggerLevel2.isTraining = false;
    // Level 1: greedy decoding, then normalize the predictions to BIO and
    // prune predictions below the level-1 confidence threshold.
    GreedyDecoding.annotateGreedy(data, taggerLevel1, 1);
    TextChunkRepresentationManager.changeChunkRepresentation(
            ParametersForLbjCode.currentParameters.taggingEncodingScheme,
            TextChunkRepresentationManager.EncodingScheme.BIO, data,
            NEWord.LabelToLookAt.PredictionLevel1Tagger);
    PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data,
            ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel1,
            NEWord.LabelToLookAt.PredictionLevel1Tagger);
    // This block runs the level-2 tagger.
    // Previously checked if features included 'PatternFeatures'.
    boolean level2 = ParametersForLbjCode.currentParameters.featuresToUse
            .containsKey("PredictionsLevel1");
    if (taggerLevel2 != null && level2) {
        // Annotate with patterns: expose the level-1 predictions to level 2 as
        // aggregation features, decode again, then prune and normalize to BIO.
        PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data, 0.0,
                NEWord.LabelToLookAt.PredictionLevel1Tagger);
        TwoLayerPredictionAggregationFeatures.setLevel1AggregationFeatures(data, false);
        GreedyDecoding.annotateGreedy(data, taggerLevel2, 2);
        PredictionsAndEntitiesConfidenceScores.pruneLowConfidencePredictions(data,
                ParametersForLbjCode.currentParameters.minConfidencePredictionsLevel2,
                NEWord.LabelToLookAt.PredictionLevel2Tagger);
        TextChunkRepresentationManager.changeChunkRepresentation(
                ParametersForLbjCode.currentParameters.taggingEncodingScheme,
                TextChunkRepresentationManager.EncodingScheme.BIO, data,
                NEWord.LabelToLookAt.PredictionLevel2Tagger);
    } else {
        // No level-2 model: copy the level-1 predictions into the level-2 slot.
        for (int docid = 0; docid < data.documents.size(); docid++) {
            ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
            for (LinkedVector sentence : sentences)
                for (int i = 0; i < sentence.size(); i++) {
                    NEWord w = (NEWord) sentence.get(i);
                    w.neTypeLevel2 = w.neTypeLevel1;
                }
        }
    }
}
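The method is protected; external code usually reaches the decoder through the public Decoder.annotateDataBIO (used by ExpressiveFeaturesAnnotator further down). A hedged sketch of such a call site; the (model, lexicon) constructor arguments are an assumption about how the generated LBJava classifiers are loaded.

// Hypothetical call site; model-path constructor arguments are assumed.
static void decode(Data data, String modelPath) throws Exception {
    NETaggerLevel1 tagger1 =
            new NETaggerLevel1(modelPath + ".level1", modelPath + ".level1.lex");
    NETaggerLevel2 tagger2 =
            new NETaggerLevel2(modelPath + ".level2", modelPath + ".level2.lex");
    // Runs level 1, optionally level 2, and leaves BIO predictions on each NEWord.
    Decoder.annotateDataBIO(data, tagger1, tagger2);
}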
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class AnnotatedDocument, method init.
public void init() {
    HashMap<String, ArrayList<String>> out = new HashMap<>();
    StringBuffer res = new StringBuffer();
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (int i = 0; i < sentences.size(); i++) {
            LinkedVector vector = sentences.get(i);
            boolean open = false;
            String[] predictions = new String[vector.size()];
            String[] words = new String[vector.size()];
            for (int j = 0; j < vector.size(); j++) {
                predictions[j] = ((NEWord) vector.get(j)).neTypeLevel2;
                words[j] = ((NEWord) vector.get(j)).form;
            }
            StringBuffer entity = null;
            String tag = null;
            for (int j = 0; j < vector.size(); j++) {
                // Open a bracket on B-X, or on I-X when it does not continue
                // the entity type of the previous token.
                if (predictions[j].startsWith("B-")
                        || (j > 0 && predictions[j].startsWith("I-")
                                && (!predictions[j - 1].endsWith(predictions[j].substring(2))))) {
                    res.append("[").append(predictions[j].substring(2)).append(" ");
                    entity = new StringBuffer();
                    open = true;
                    tag = predictions[j].substring(2);
                }
                res.append(words[j]).append(" ");
                if (open) {
                    entity.append(words[j]).append(" ");
                    // Close the bracket at the end of the sentence, or when the
                    // next tag starts a new entity, is O, or changes the type.
                    boolean close = false;
                    if (j == vector.size() - 1) {
                        close = true;
                    } else {
                        if (predictions[j + 1].startsWith("B-"))
                            close = true;
                        if (predictions[j + 1].equals("O"))
                            close = true;
                        if (predictions[j + 1].indexOf('-') > -1
                                && (!predictions[j].endsWith(predictions[j + 1].substring(2))))
                            close = true;
                    }
                    if (close) {
                        String str_res = res.toString().trim();
                        res = new StringBuffer(str_res);
                        res.append("] ");
                        open = false;
                        if (out.containsKey(tag))
                            out.get(tag).add(entity.toString().trim());
                        else {
                            ArrayList<String> entities = new ArrayList<>();
                            entities.add(entity.toString().trim());
                            out.put(tag, entities);
                        }
                    }
                }
            }
        }
    }
    taggedLine = res.toString();
    labels = out;
}
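To make the bracket-building loop concrete, here is a hand trace over hypothetical level-2 predictions (the sentence and tags are invented for illustration):

// Hand-traced example (hypothetical predictions):
//   words:        John    Smith   visited   Rome
//   neTypeLevel2: B-PER   I-PER   O         B-LOC
// After init():
//   taggedLine -> "[PER John Smith] visited [LOC Rome] "   (note trailing space)
//   labels     -> {PER=[John Smith], LOC=[Rome]}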
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class BracketFileReader, method parseTextWithBrackets.
public static NERDocument parseTextWithBrackets(String annotatedText, String docname)
        throws Exception {
    if (annotatedText.replace(" ", "").replace("\n", "").replace("\t", "").length() == 0)
        return new NERDocument(new ArrayList<LinkedVector>(), docname);
    // Note: the bracket tokens can include newlines!
    Vector<String> bracketTokens = new Vector<>();
    Vector<String> bracketTokensTags = new Vector<>();
    parseBracketsAnnotatedText(annotatedText, bracketTokensTags, bracketTokens);
    StringBuilder buff = new StringBuilder(bracketTokens.size() * 20);
    for (int i = 0; i < bracketTokens.size(); i++)
        buff.append(bracketTokens.elementAt(i)).append(" ");
    // The tokens below will have no newline characters.
    // logger.info("Raw text: " + buff);
    Vector<Vector<String>> parsedTokens =
            PlainTextReader.sentenceSplitAndTokenizeText(buff.toString());
    // Now we need to align the bracket tokens to the sentence-split and tokenized
    // tokens. There are two issues to be careful with:
    // 1) the bracket tokens may have newline characters as individual tokens,
    //    the others will not;
    // 2) the tokenized/sentence-split tokens may be bracket tokens broken into
    //    separate tokens.
    Vector<String> parsedTokensFlat = new Vector<>();
    for (int i = 0; i < parsedTokens.size(); i++)
        for (int j = 0; j < parsedTokens.elementAt(i).size(); j++)
            parsedTokensFlat.addElement(parsedTokens.elementAt(i).elementAt(j));
    // logger.info("----" + parsedTokensFlat.size());
    // To be filled during alignment below.
    Vector<String> parsedTokensTagsFlat = new Vector<>();
    StringBuilder bracketTokensText = new StringBuilder(bracketTokens.size() * 20);
    StringBuilder parsedTokensText = new StringBuilder(parsedTokensFlat.size() * 20);
    int bracketsTokensPos = 0;
    int parsedTokensPos = 0;
    while (bracketsTokensPos < bracketTokens.size()) {
        // Skip newline tokens, which exist only on the bracket side.
        while (bracketsTokensPos < bracketTokens.size()
                && bracketTokens.elementAt(bracketsTokensPos).equals("\n"))
            bracketsTokensPos++;
        if (bracketsTokensPos < bracketTokens.size()) {
            bracketTokensText.append(" ").append(bracketTokens.elementAt(bracketsTokensPos));
            String currentLabel = bracketTokensTags.elementAt(bracketsTokensPos);
            parsedTokensTagsFlat.addElement(currentLabel);
            parsedTokensText.append(" ").append(parsedTokensFlat.elementAt(parsedTokensPos));
            parsedTokensPos++;
            // If the tokenizer split one bracket token into several pieces, keep
            // consuming parsed tokens (propagating B-X labels as I-X) until the
            // two running texts match again.
            while ((!bracketTokensText.toString().equals(parsedTokensText.toString()))
                    && parsedTokensPos < parsedTokensFlat.size()) {
                if (currentLabel.startsWith("B-"))
                    parsedTokensTagsFlat.addElement("I-" + currentLabel.substring(2));
                else
                    parsedTokensTagsFlat.addElement(currentLabel);
                parsedTokensText.append(parsedTokensFlat.elementAt(parsedTokensPos));
                parsedTokensPos++;
            }
            if (!bracketTokensText.toString().equals(parsedTokensText.toString()))
                throw new Exception(
                        "Error aligning raw brackets tokens to token/sentence split tokens\n"
                                + "Brackets token text till now:\n" + bracketTokensText
                                + "\nTokenized text till now:\n" + parsedTokensText);
            bracketsTokensPos++;
        }
    }
    // OK, we're done; just build the output sentences.
    ArrayList<LinkedVector> res = new ArrayList<>();
    parsedTokensPos = 0;
    for (int i = 0; i < parsedTokens.size(); i++) {
        LinkedVector sentence = new LinkedVector();
        for (int j = 0; j < parsedTokens.elementAt(i).size(); j++) {
            NEWord.addTokenToSentence(sentence, parsedTokensFlat.elementAt(parsedTokensPos),
                    parsedTokensTagsFlat.elementAt(parsedTokensPos));
            parsedTokensPos++;
        }
        res.add(sentence);
    }
    return new NERDocument(res, docname);
}
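A minimal, hypothetical usage sketch. The bracket input format is inferred from the parser above, and the assumption that NEWord.neLabel holds the gold label read from the brackets is flagged in the comments.

// Hypothetical driver; assumes NEWord.neLabel carries the gold label parsed
// from the brackets, and that the input uses the [TYPE token ...] format.
NERDocument doc = BracketFileReader.parseTextWithBrackets(
        "[PER John Smith] visited [LOC Rome]", "demo");
for (LinkedVector sentence : doc.sentences) {
    for (int i = 0; i < sentence.size(); i++) {
        NEWord w = (NEWord) sentence.get(i);
        System.out.println(w.form + "\t" + w.neLabel);
    }
}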
Use of edu.illinois.cs.cogcomp.lbjava.parse.LinkedVector in project cogcomp-nlp by CogComp.
The class ExpressiveFeaturesAnnotator, method annotate.
/**
 * Do not worry about the Brown clusters and word embeddings; this stuff is added on the
 * fly in the .lbj feature generators...
 */
public static void annotate(Data data) throws Exception {
    /*
     * Must run after the linkability has been initialized!!!
     */
    if (ParametersForLbjCode.currentParameters.normalizeTitleText) {
        // logger.info("Normalizing text case ...");
        TitleTextNormalizer.normalizeCase(data);
    }
    if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("BrownClusterPaths")) {
        // logger.info("Brown clusters OOV statistics:");
        BrownClusters.get().printOovData(data);
    }
    if (ParametersForLbjCode.currentParameters.featuresToUse.containsKey("WordEmbeddings")) {
        // logger.info("Word Embeddings OOV statistics:");
        WordEmbeddings.printOovData(data);
    }
    // Annotating with gazetteers.
    if (ParametersForLbjCode.currentParameters.featuresToUse != null) {
        if (ParametersForLbjCode.currentParameters.featuresToUse
                .containsKey("GazetteersFeatures")) {
            // First make sure the gazetteer arrays are initialized for each word.
            for (int docid = 0; docid < data.documents.size(); docid++) {
                ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
                for (LinkedVector sentence : sentences) {
                    for (int j = 0; j < sentence.size(); j++) {
                        NEWord ww = (NEWord) sentence.get(j);
                        if (ww.gazetteers == null)
                            ww.gazetteers = new ArrayList<>();
                    }
                }
            }
            Gazetteers gaz = GazetteersFactory.get();
            for (int docid = 0; docid < data.documents.size(); docid++) {
                ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
                for (LinkedVector vector : sentences) {
                    for (int j = 0; j < vector.size(); j++)
                        gaz.annotate((NEWord) vector.get(j));
                }
            }
            // Sort the gazetteer matches.
            for (int docid = 0; docid < data.documents.size(); docid++) {
                ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
                for (LinkedVector vector : sentences) {
                    for (int j = 0; j < vector.size(); j++)
                        Collections.sort(((NEWord) vector.get(j)).gazetteers);
                }
            }
        }
    }
    // Annotating the nonlocal features.
    for (int docid = 0; docid < data.documents.size(); docid++) {
        ArrayList<LinkedVector> sentences = data.documents.get(docid).sentences;
        for (LinkedVector vector : sentences) {
            for (int j = 0; j < vector.size(); j++)
                ContextAggregation.annotate((NEWord) vector.get(j));
        }
    }
    /*
     * Note that this piece of code must be the last!!! Here we are adding as features the
     * predictions of the aux models.
     */
    for (int i = 0; i < ParametersForLbjCode.currentParameters.auxiliaryModels.size(); i++) {
        // Temporarily swap in the auxiliary model's parameters, decode with it,
        // convert its predictions to BILOU, then restore the current model.
        ParametersForLbjCode currentModel = ParametersForLbjCode.currentParameters;
        ParametersForLbjCode.currentParameters =
                ParametersForLbjCode.currentParameters.auxiliaryModels.elementAt(i);
        Decoder.annotateDataBIO(data,
                (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1,
                (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2);
        Vector<Data> v = new Vector<>();
        v.addElement(data);
        NETesterMultiDataset.printAllTestResultsAsOneDataset(v, false);
        TextChunkRepresentationManager.changeChunkRepresentation(
                TextChunkRepresentationManager.EncodingScheme.BIO,
                TextChunkRepresentationManager.EncodingScheme.BILOU, data,
                NEWord.LabelToLookAt.PredictionLevel1Tagger);
        TextChunkRepresentationManager.changeChunkRepresentation(
                TextChunkRepresentationManager.EncodingScheme.BIO,
                TextChunkRepresentationManager.EncodingScheme.BILOU, data,
                NEWord.LabelToLookAt.PredictionLevel2Tagger);
        // addAuxiliaryClassifierFeatures(data, "aux_model_" + i);
        ParametersForLbjCode.currentParameters = currentModel;
    }
}
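For orientation, a hedged sketch of where this annotator sits in a typical decode pass. Only the two calls shown appear in this listing; everything else (loading the documents into `data`, setting up ParametersForLbjCode.currentParameters with trained taggers) is assumed to have happened already.

// Hedged sketch of a typical decode pass; `data` and the loaded taggers are
// assumed to be set up beforehand.
ExpressiveFeaturesAnnotator.annotate(data);  // gazetteers, context, aux-model features
Decoder.annotateDataBIO(data,
        (NETaggerLevel1) ParametersForLbjCode.currentParameters.taggerLevel1,
        (NETaggerLevel2) ParametersForLbjCode.currentParameters.taggerLevel2);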