use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView in project cogcomp-nlp by CogComp.
the class CurrencyIndicator method addCurrencyView.
/**
 * Adds a {@link SpanLabelView} named {@code VIEW_NAME} to the given text annotation, labelling
 * every span that matches one of the {@code currencies} patterns as {@code "CURRENCY"}.
 *
 * <p>If the currency list has not been loaded yet, it is loaded first via
 * {@code loadCurrency(gzip, true)}. If the annotation already has the view, nothing is done.
 * A match is skipped when a span added earlier already covers it, so nested constituents of
 * the same type are not emitted.
 *
 * @param ta the text annotation to add the view to
 * @throws Exception if loading the currency list fails
 */
private void addCurrencyView(TextAnnotation ta) throws Exception {
    // Load the currency list on first use; the flag is checked again while holding the lock
    // so that a thread which waited for the lock does not load a second time.
    if (!loaded) {
        synchronized (this) {
            // now it's changed to be loaded from datastore.
            if (!loaded) {
                loadCurrency(gzip, true);
            }
        }
    }
    synchronized (ta) {
        if (ta.hasView(VIEW_NAME)) {
            return;
        }

        // Collect the spans matched by every currency pattern.
        List<IntPair> matches = new ArrayList<>();
        for (String pattern : currencies) {
            matches.addAll(ta.getSpansMatching(pattern));
        }

        SpanLabelView view = new SpanLabelView(VIEW_NAME, "Gazetteer", ta, 1.0);
        Set<IntPair> added = new LinkedHashSet<>();
        for (IntPair p : matches) {
            // don't add nested constituents of the same type
            // NOTE(review): only spans added *earlier* are considered, so a short match that is
            // seen before its containing match is still added - confirm this is acceptable.
            boolean foundContainer = false;
            for (IntPair p1 : added) {
                if (p1 == p) {
                    continue;
                }
                if (p1.getFirst() <= p.getFirst() && p1.getSecond() >= p.getSecond()) {
                    foundContainer = true;
                    break;
                }
            }
            if (!foundContainer) {
                view.addSpanLabel(p.getFirst(), p.getSecond(), "CURRENCY", 1.0);
                added.add(p);
            }
        }
        ta.addView(VIEW_NAME, view);
    }
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView in project cogcomp-nlp by CogComp.
the class SimpleGazetteerAnnotatorTest method testAddView.
/**
 * Test method for
 * {@link edu.illinois.cs.cogcomp.edison.annotators.SimpleGazetteerAnnotator#addView(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)}
 * .
 *
 * <p>Runs the annotator three times: with the default resource manager, with a configuration
 * that sets the phrase length to 4, and on a cased variant of the sentence. Each run checks
 * the surface forms and labels of the constituents in the {@code TREE_GAZETTEER} view.
 *
 * @throws URISyntaxException
 * @throws IOException
 * @throws AnnotatorException
 */
@Test
public void testAddView() throws IOException, URISyntaxException, AnnotatorException {
    // --- default configuration ---
    SimpleGazetteerAnnotator sga = new SimpleGazetteerAnnotator(defaultRm);
    assertEquals("Wrong number of dictionaries loaded.", 1, sga.dictionaries.size());
    assertEquals("Wrong number of dictionaries loaded.", 1, sga.dictionariesIgnoreCase.size());
    TextAnnotation ta = tab.createTextAnnotation("I hail from the university of illinois at champaign urbana.");
    sga.addView(ta);
    SpanLabelView view = (SpanLabelView) ta.getView(ViewNames.TREE_GAZETTEER);
    List<Constituent> entities = view.getConstituents();
    Constituent c1 = entities.get(0);
    assertEquals("university of illinois", c1.toString());
    Constituent c2 = entities.get(1);
    assertEquals("university of illinois at champaign urbana", c2.toString());
    Constituent c3 = entities.get(2);
    assertEquals("illinois", c3.toString());
    Constituent c4 = entities.get(3);
    assertEquals("champaign", c4.toString());
    Constituent c5 = entities.get(4);
    assertEquals("urbana", c5.toString());
    assertEquals("organizations(IC)", c1.getLabel());
    assertEquals("organizations(IC)", c2.getLabel());
    assertEquals("places(IC)", c3.getLabel());
    assertEquals("places(IC)", c4.getLabel());
    assertEquals("places(IC)", c5.getLabel());

    // --- phrase length 4, eager initialization: the six-token phrase is no longer expected ---
    Properties props = new Properties();
    props.setProperty(SimpleGazetteerAnnotatorConfigurator.PHRASE_LENGTH.key, "4");
    props.setProperty(SimpleGazetteerAnnotatorConfigurator.PATH_TO_DICTIONARIES.key, "/testgazetteers/");
    props.setProperty(SimpleGazetteerAnnotatorConfigurator.IS_LAZILY_INITIALIZED.key, SimpleGazetteerAnnotatorConfigurator.FALSE);
    sga = new SimpleGazetteerAnnotator(new ResourceManager(props));
    assertEquals("Wrong number of dictionaries loaded.", 1, sga.dictionaries.size());
    assertEquals("Wrong number of dictionaries loaded.", 1, sga.dictionariesIgnoreCase.size());
    ta = tab.createTextAnnotation("I hail from the university of illinois at champaign urbana.");
    sga.addView(ta);
    view = (SpanLabelView) ta.getView(ViewNames.TREE_GAZETTEER);
    entities = view.getConstituents();
    c1 = entities.get(0);
    assertEquals("university of illinois", c1.toString());
    c2 = entities.get(1);
    assertEquals("illinois", c2.toString());
    c3 = entities.get(2);
    assertEquals("champaign", c3.toString());
    c4 = entities.get(3);
    assertEquals("urbana", c4.toString());
    assertEquals("organizations(IC)", c1.getLabel());
    assertEquals("places(IC)", c2.getLabel());
    assertEquals("places(IC)", c3.getLabel());
    assertEquals("places(IC)", c4.getLabel());

    // --- cased input with the same annotator ---
    ta = tab.createTextAnnotation("I hail from the University of Illinois at champaign urbana.");
    sga.addView(ta);
    view = (SpanLabelView) ta.getView(ViewNames.TREE_GAZETTEER);
    entities = view.getConstituents();
    c1 = entities.get(0);
    assertEquals("University of Illinois", c1.toString());
    assertEquals("organizations", c1.getLabel());
    c2 = entities.get(1);
    // NOTE(review): c2 is fetched above but the two assertions below re-check c1, which looks
    // like a copy-paste slip. Left as-is because the expected surface form and label of the
    // second constituent cannot be determined from this test alone - confirm and fix.
    assertEquals("University of Illinois", c1.toString());
    assertEquals("organizations", c1.getLabel());
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView in project cogcomp-nlp by CogComp.
the class TestBrownClusterViewGenerator method testCharniakParseViewGenerator.
/**
 * Test the configuration of normalizing tokens in the brown clusters.
 *
 * <p>First builds a generator with the default configuration and checks the surface forms and
 * labels of the first three constituents; then builds one with {@code NORMALIZE_TOKEN} set to
 * false and checks the first two. Any failure while building or applying a generator fails the
 * test immediately with the original exception attached as the cause.
 */
@Test
public final void testCharniakParseViewGenerator() {
    String sentence = "a test .";
    TextAnnotation ta = TextAnnotationUtilities.createFromTokenizedString(sentence);

    // The default configuration: do normalization
    BrownClusterViewGenerator viewGenerator;
    try {
        viewGenerator = new BrownClusterViewGenerator(BrownClusterViewGenerator.file100, BrownClusterViewGenerator.file100);
        viewGenerator.addView(ta);
    } catch (Exception e) {
        // Fail here with the real cause instead of printing the trace and hitting an
        // unrelated NullPointerException on the lines below.
        throw new AssertionError("Could not create or apply the default BrownClusterViewGenerator", e);
    }
    SpanLabelView view = (SpanLabelView) ta.getView(viewGenerator.getViewName());
    assertEquals("a", view.getConstituents().get(0).getSurfaceForm());
    assertEquals("111011111", view.getConstituents().get(0).getLabel());
    assertEquals("a", view.getConstituents().get(1).getSurfaceForm());
    assertEquals("10010", view.getConstituents().get(1).getLabel());
    assertEquals("test", view.getConstituents().get(2).getSurfaceForm());
    assertEquals("001110", view.getConstituents().get(2).getLabel());

    // Don't normalize tokens in the brown clusters
    Properties props = new Properties();
    props.setProperty(BrownClusterViewGeneratorConfigurator.NORMALIZE_TOKEN.key, Configurator.FALSE);
    ResourceManager rm = new ResourceManager(props);
    try {
        viewGenerator = new BrownClusterViewGenerator(BrownClusterViewGenerator.file100, BrownClusterViewGenerator.file100, rm);
        viewGenerator.addView(ta);
    } catch (Exception e) {
        throw new AssertionError("Could not create or apply the non-normalizing BrownClusterViewGenerator", e);
    }
    view = (SpanLabelView) ta.getView(viewGenerator.getViewName());
    assertEquals("a", view.getConstituents().get(0).getSurfaceForm());
    assertEquals("10010", view.getConstituents().get(0).getLabel());
    assertEquals("test", view.getConstituents().get(1).getSurfaceForm());
    assertEquals("001110", view.getConstituents().get(1).getLabel());
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView in project cogcomp-nlp by CogComp.
the class CommaTest method setUp.
/**
 * Builds the fixture for the comma tests: a single-sentence text annotation with hand-made
 * POS, Stanford-parse, NER and shallow-parse views, wrapped in a {@code CommaSRLSentence}
 * whose commas are stored in the {@code commas} field.
 */
@Override
public void setUp() throws Exception {
    String[] words = "Says Gayle Key , a mathematics teacher , `` Hello world . ''".split("\\s+");
    TextAnnotation annotation = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(Collections.singletonList(words));

    // One POS tag per token, in token order.
    String[] posTags = {"VBZ", "NNP", "NNP", ",", "DT", "NN", "NN", ",", "``", "UH", "NN", ".", "''"};
    TokenLabelView posView = new TokenLabelView(ViewNames.POS, "Test", annotation, 1.0);
    for (int tokenId = 0; tokenId < posTags.length; tokenId++) {
        posView.addTokenLabel(tokenId, posTags[tokenId], 1d);
    }

    // Bracketed parse of the sentence, attached as the tree of sentence 0.
    String bracketedTree = "(ROOT"
            + " (SINV"
            + " (VP (VBZ Says))"
            + " (NP (NNP Gayle) (NNP Key))"
            + " (, ,)"
            + " (S"
            + " (NP (DT a) (NNS mathematics))"
            + " (VP (VBZ teacher) (, ,) (`` ``)"
            + " (NP"
            + " (INTJ (UH Hello))"
            + " (NP (NN world)))))"
            + " (. .) ('' '')))";
    TreeView parseView = new TreeView(ViewNames.PARSE_STANFORD, "Test", annotation, 1.0);
    parseView.setParseTree(0, TreeParserFactory.getStringTreeParser().parse(bracketedTree));

    SpanLabelView nerView = new SpanLabelView(ViewNames.NER_CONLL, "Test", annotation, 1.0);
    nerView.addSpanLabel(1, 3, "PER", 1.0);

    // {start, end} token offsets of the NP chunks.
    int[][] chunkSpans = {{0, 3}, {4, 7}, {9, 11}};
    SpanLabelView chunkView = new SpanLabelView(ViewNames.SHALLOW_PARSE, "Test", annotation, 1.0);
    for (int[] span : chunkSpans) {
        chunkView.addSpanLabel(span[0], span[1], "NP", 1.0);
    }

    // TODO dependency parse
    // TODO SRL view
    annotation.addView(posView.getViewName(), posView);
    annotation.addView(parseView.getViewName(), parseView);
    annotation.addView(nerView.getViewName(), nerView);
    annotation.addView(chunkView.getViewName(), chunkView);

    // Refined labels for the first and second comma, respectively.
    List<String> labelsForFirstComma = Collections.singletonList("Substitute");
    List<String> labelsForSecondComma = Arrays.asList("Substitute", "Quotation");
    CommaSRLSentence srlSentence = new CommaSRLSentence(annotation, null, Arrays.asList(labelsForFirstComma, labelsForSecondComma));

    List<Comma> extracted = srlSentence.getCommas();
    commas = extracted.toArray(new Comma[extracted.size()]);
}
use of edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView in project cogcomp-nlp by CogComp.
the class NERAnnotator method addView.
/**
 * Generate the view representing the list of extracted entities and add it to the
 * {@link TextAnnotation}.
 *
 * <p>The tokens of each sentence are converted to NER words (zero-length tokens are logged and
 * skipped), the NER models are run, and every predicted entity is added as a labelled span to
 * a {@link SpanLabelView}. If annotation throws, the error is logged and no view is added.
 *
 * @param ta the text annotation to annotate
 */
@Override
public void addView(TextAnnotation ta) {
    // convert this data structure into one the NER package can deal with.
    ArrayList<LinkedVector> sentences = new ArrayList<>();
    String[] tokens = ta.getTokens();
    // tokenindices[k] is the TextAnnotation token index of the k-th non-empty token given to NER.
    int[] tokenindices = new int[tokens.length];
    int tokenIndex = 0;
    int neWordIndex = 0;
    for (int i = 0; i < ta.getNumberOfSentences(); i++) {
        Sentence sentence = ta.getSentence(i);
        String[] wtoks = sentence.getTokens();
        LinkedVector words = new LinkedVector();
        for (String w : wtoks) {
            if (w.length() > 0) {
                NEWord.addTokenToSentence(words, w, "unlabeled");
                tokenindices[neWordIndex] = tokenIndex;
                neWordIndex++;
            } else {
                logger.error("Bad (zero length) token.");
            }
            tokenIndex++;
        }
        if (words.size() > 0) {
            sentences.add(words);
        }
    }

    // Do the annotation.
    Data data = new Data(new NERDocument(sentences, "input"));
    try {
        ExpressiveFeaturesAnnotator.annotate(data);
        Decoder.annotateDataBIO(data, t1, t2);
    } catch (Exception e) {
        logger.error("Cannot annotate the text, the exception was: ", e);
        return;
    }

    // now we have the parsed entities, construct the view object.
    // the data always has a single document
    // each LinkedVector in data corresponds to a sentence.
    ArrayList<LinkedVector> nerSentences = data.documents.get(0).sentences;
    SpanLabelView nerView = new SpanLabelView(getViewName(), ta);

    // Index of the current NER word across all sentences; it indexes tokenindices.
    int tokenoffset = 0;
    for (LinkedVector vector : nerSentences) {
        boolean open = false;
        // there should be a 1:1 mapping btw sentence tokens in record and words/predictions
        // from NER.
        int startIndex = -1;
        String label = null;
        for (int j = 0; j < vector.size(); j++, tokenoffset++) {
            NEWord neWord = (NEWord) (vector.get(j));
            String prediction = neWord.neTypeLevel2;

            // inefficient, use enums, or nominalized indexes for this sort of thing.
            if (prediction.startsWith("B-")) {
                startIndex = tokenoffset;
                label = prediction.substring(2);
                open = true;
            } else if (j > 0) {
                // An "I-" tag whose type differs from the previous word's also starts an entity.
                String previous_prediction = ((NEWord) vector.get(j - 1)).neTypeLevel2;
                if (prediction.startsWith("I-") && (!previous_prediction.endsWith(prediction.substring(2)))) {
                    startIndex = tokenoffset;
                    label = prediction.substring(2);
                    open = true;
                }
            }

            if (open) {
                // The entity ends here if this is the last word of the sentence, or the next
                // word starts a new entity, is outside any entity, or has a different type.
                boolean close = false;
                if (j == vector.size() - 1) {
                    close = true;
                } else {
                    String next_prediction = ((NEWord) vector.get(j + 1)).neTypeLevel2;
                    if (next_prediction.startsWith("B-")) {
                        close = true;
                    }
                    if (next_prediction.equals("O")) {
                        close = true;
                    }
                    if (next_prediction.indexOf('-') > -1 && (!prediction.endsWith(next_prediction.substring(2)))) {
                        close = true;
                    }
                }
                if (close) {
                    int s = tokenindices[startIndex];
                    // The span end is one past the entity's last token, i.e. the token index of
                    // the current word plus one. Looking up the *next* word's index instead
                    // dropped the final token of an entity ending on the last token of the
                    // text, and could read an unfilled slot when zero-length tokens had been
                    // skipped. tokenoffset >= startIndex and tokenindices is strictly
                    // increasing over the filled slots, so e > s always holds.
                    int e = tokenindices[tokenoffset] + 1;
                    nerView.addSpanLabel(s, e, label, 1d);
                    open = false;
                }
            }
        }
    }
    ta.addView(viewName, nerView);
}
Aggregations