use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class Preprocess method runNER.
/**
 * Tokenizes the given text into a {@link TextAnnotation} and asks the {@code co} annotator
 * (declared outside this method; presumably the NER annotator, given the method name) to
 * compute its view on it.
 *
 * <p>This is best-effort: if the annotator throws, the stack trace is printed and the
 * annotation is still returned, possibly without the annotator's view.
 *
 * @param s raw text to annotate
 * @return the annotation built from {@code s}; corpus id and text id are both "001"
 */
public TextAnnotation runNER(String s) {
    // Both tokenizer flags are off; the first one controls splitting on hyphens.
    final TextAnnotationBuilder builder =
            new TokenizerTextAnnotationBuilder(new StatefulTokenizer(false, false));
    final TextAnnotation annotation = builder.createTextAnnotation("001", "001", s);
    try {
        co.getView(annotation);
    } catch (Exception ex) {
        // Deliberately non-fatal: report the problem and hand back what we have.
        ex.printStackTrace();
    }
    return annotation;
}
use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class ExampleUsage method SemEvalAnnotate.
/**
 * Example: classify one relation between two hand-picked mentions with the SemEval
 * relation classifier and print the predicted tag to standard output.
 *
 * <p>Steps:
 * <ol>
 *   <li>build a {@link TextAnnotation} for a fixed sentence;</li>
 *   <li>add the POS, chunker and Stanford dependency views;</li>
 *   <li>fetch the relation model and the gazetteers through {@link Datastore};</li>
 *   <li>build an "RE_ANNOTATED" view that copies every token and attaches the
 *       "WORDNETTAG" / "WORDNETHYM" attributes;</li>
 *   <li>create two mention constituents, wrap them in a {@link Relation} and classify it.</li>
 * </ol>
 *
 * <p>If any part of the setup (steps 2-4) fails, the stack trace is printed and the method
 * returns without classifying. Continuing would dereference gazetteers that were never
 * loaded and load the classifier from an empty model path.
 */
public static void SemEvalAnnotate() {
    String text = "People have been moving back into downtown.";
    String corpus = "semeval";
    String textId = "001";

    // Create a TextAnnotation From Text
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);

    POSAnnotator pos_annotator = new POSAnnotator();
    ChunkerAnnotator chunker = new ChunkerAnnotator(true);
    chunker.initialize(new ChunkerConfigurator().getDefaultConfig());

    // Configuration handed to the Stanford POS tagger and parser wrappers.
    // NOTE(review): java.util.Properties#getProperty only returns String values, so the
    // non-String values stored below may be invisible to readers that use getProperty.
    // Confirm how the Stanford annotators read them before changing the value types.
    Properties stanfordProps = new Properties();
    stanfordProps.put("annotators", "pos, parse");
    stanfordProps.put("parse.originalDependencies", true);
    stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
    stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
    POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
    ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
    StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator);

    // Both are assigned inside the try block; the catch block returns, so they are
    // definitely assigned wherever they are used afterwards.
    String modelPath;
    FlatGazetteers gazetteers;
    try {
        ta.addView(pos_annotator);
        chunker.addView(ta);
        stanfordDepHandler.addView(ta);

        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File model = ds.getDirectory("org.cogcomp.re", "SEMEVAL", 1.1, false);
        modelPath = model.getPath();
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);

        WordNetManager.loadConfigAsClasspathResource(true);
        WordNetManager wordnet = WordNetManager.getInstance();

        // Copy every token into a new view, keeping its attributes and adding WordNet ones.
        View annotatedTokenView = new SpanLabelView("RE_ANNOTATED", ta);
        for (Constituent co : ta.getView(ViewNames.TOKENS).getConstituents()) {
            Constituent c = co.cloneForNewView("RE_ANNOTATED");
            for (String s : co.getAttributeKeys()) {
                c.addAttribute(s, co.getAttribute(s));
            }
            c.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordnet, c));
            c.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordnet, c));
            annotatedTokenView.addConstituent(c);
        }
        ta.addView("RE_ANNOTATED", annotatedTokenView);
    } catch (Exception e) {
        // Setup failed: report it and stop instead of failing later with an unrelated
        // NullPointerException on the gazetteers.
        e.printStackTrace();
        return;
    }

    // Two single-token mentions (token spans [0,1) and [6,7)) to relate to each other.
    Constituent source = new Constituent("first", "Mention", ta, 0, 1);
    Constituent target = new Constituent("second", "Mention", ta, 6, 7);
    source.addAttribute("GAZ", gazetteers.annotatePhrase(source));
    target.addAttribute("GAZ", gazetteers.annotatePhrase(target));
    Relation relation = new Relation("TEST", source, target, 1.0f);

    // The classifier is loaded from <modelPath>/SEMEVAL/SEMEVAL.lc and .lex.
    String prefix = modelPath + File.separator + "SEMEVAL" + File.separator + "SEMEVAL";
    semeval_relation_classifier classifier = new semeval_relation_classifier(prefix + ".lc", prefix + ".lex");
    String tag = classifier.discreteValue(relation);
    System.out.println(tag);
}
use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class ExampleUsage method AnnotatorExample.
/**
 * Example: run the relation annotator on a fixed sentence and print every predicted
 * relation found in the mention view.
 *
 * <p>The POS, chunker and Stanford dependency views are added before the relation
 * annotator runs. If any of these steps fails, the stack trace is printed and the method
 * returns, because the mention view read afterwards may not have been added.
 */
public static void AnnotatorExample() {
    String text = "He went to Chicago after his Father moved there.";
    String corpus = "story";
    String textId = "001";

    // Create a TextAnnotation From Text
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);

    POSAnnotator pos_annotator = new POSAnnotator();
    ChunkerAnnotator chunker = new ChunkerAnnotator(true);
    chunker.initialize(new ChunkerConfigurator().getDefaultConfig());

    // Configuration handed to the Stanford POS tagger and parser wrappers.
    // NOTE(review): java.util.Properties#getProperty only returns String values, so the
    // non-String values stored below may be invisible to readers that use getProperty.
    // Confirm how the Stanford annotators read them before changing the value types.
    Properties stanfordProps = new Properties();
    stanfordProps.put("annotators", "pos, parse");
    stanfordProps.put("parse.originalDependencies", true);
    stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
    stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
    POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
    ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
    StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator);
    RelationAnnotator relationAnnotator = new RelationAnnotator();

    try {
        ta.addView(pos_annotator);
        chunker.addView(ta);
        stanfordDepHandler.addView(ta);
        relationAnnotator.addView(ta);
    } catch (Exception e) {
        // Annotation failed: report it and stop rather than reading a view that may be missing.
        e.printStackTrace();
        return;
    }

    View mentionView = ta.getView(ViewNames.MENTION);
    List<Relation> predictedRelations = mentionView.getRelations();
    for (Relation r : predictedRelations) {
        IOHelper.printRelation(r);
    }
}
use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class TemporalNormalizerBenchmark method testTemporalChunker.
/**
 * Normalize the dataset using our Chunker for temporal phrases extraction.
 *
 * <p>Every text in {@code testText} is tokenized and POS-tagged first. Then, for each
 * document, the matching creation time from {@code DCTs} is set on {@code tca}, the
 * annotator's view is added, and the result is written to
 * {@code <outputFolder>/<docID>.tml}.
 *
 * @param outputFolder directory that receives one {@code .tml} file per document; it is
 *        created (including missing parent directories) when it does not exist
 * @param verbose when {@code true}, prints progress, the TIMEX string of every extracted
 *        chunk and the elapsed times to standard output
 * @throws Exception if a called component propagates one; an {@link AnnotatorException}
 *         from either annotator is turned into a test failure instead
 */
public void testTemporalChunker(String outputFolder, boolean verbose) throws Exception {
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(false, false));
    ResourceManager nerRm = new TemporalChunkerConfigurator().getDefaultConfig();
    // NOTE(review): the result of this classpath check is discarded — confirm whether it
    // was meant to guard the rest of the method.
    IOUtilities.existsInClasspath(TemporalChunkerAnnotator.class, nerRm.getString("modelDirPath"));
    java.util.logging.Logger.getLogger("HeidelTimeStandalone").setLevel(Level.OFF);

    // Preprocessing: tokenize and POS-tag every test document.
    List<TextAnnotation> taList = new ArrayList<>();
    long preprocessTime = System.currentTimeMillis();
    POSAnnotator annotator = new POSAnnotator();
    for (int j = 0; j < testText.size(); j++) {
        TextAnnotation ta = tab.createTextAnnotation("corpus", "id", testText.get(j));
        try {
            annotator.getView(ta);
        } catch (AnnotatorException e) {
            fail("AnnotatorException thrown!\n" + e.getMessage());
        }
        taList.add(ta);
    }

    if (verbose) {
        System.out.println("Start");
    }
    long startTime = System.currentTimeMillis();

    // mkdirs() also creates missing parents; a failure is reported instead of ignored.
    File outDir = new File(outputFolder);
    if (!outDir.exists() && !outDir.mkdirs()) {
        fail("Could not create output folder " + outDir.getPath());
    }

    for (int j = 0; j < testText.size(); j++) {
        tca.addDocumentCreationTime(DCTs.get(j));
        TextAnnotation ta = taList.get(j);
        try {
            tca.addView(ta);
        } catch (AnnotatorException e) {
            // Use the message: concatenating getStackTrace() only prints the array identity.
            fail("Exception while adding TIMEX3 VIEW " + e.getMessage());
        }

        // Resolve against outDir so the file lands in the directory created above,
        // whether outputFolder is relative or absolute.
        String outputFileName = new File(outDir, docIDs.get(j) + ".tml").getPath();
        if (verbose) {
            System.out.println(docIDs.get(j));
            for (TimexChunk tc : tca.getTimex()) {
                System.out.println(tc.toTIMEXString());
            }
            System.out.println("\n");
        }
        tca.write2Text(outputFileName, docIDs.get(j), testText.get(j));
        // Clear the extracted chunks before moving on to the next document.
        tca.deleteTimex();
    }

    long endTime = System.currentTimeMillis();
    long totalTime = endTime - startTime;
    if (verbose) {
        System.out.println("Process time: " + totalTime);
        System.out.println("Preprocess + process time: " + (endTime - preprocessTime));
    }
}
use of edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class TextAnnotationTest method testCharacterOffsetToTokenIndex.
/**
 * Test whether the mapping between character offset and token index is correct.
 *
 * <p>A plain text is tokenized normally. A second {@link TextAnnotation} is then built by
 * hand over the same tokens embedded in a longer string with ignorable text before and
 * after, with every token's character offsets shifted by the length of the leading text.
 * The test checks the shifted offsets and that offsets outside any token map to -1.
 */
@Test
public void testCharacterOffsetToTokenIndex() {
    String normal = "The ordinary sample.\n\nDon't mess things up.";
    String leadingWaste = "<ignoreme>wastedspace</ignoreme>";
    String postWaste = "   \n<ignoremetoo>aaaargh</ignoremetoo>";
    String other = leadingWaste + normal + postWaste;

    TextAnnotationBuilder tabldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation taNormal = tabldr.createTextAnnotation("test", "normal", normal);
    List<Constituent> normalToks = taNormal.getView(ViewNames.TOKENS).getConstituents();
    assertEquals(13, normalToks.get(2).getStartCharOffset());
    assertEquals(24, normalToks.get(5).getStartCharOffset());

    // Shift every token's offsets past the leading text. The array is sized from the
    // actual token count rather than a hard-coded length.
    int ignoreUpToOffset = leadingWaste.length();
    IntPair[] characterOffsets = new IntPair[normalToks.size()];
    String[] tokens = taNormal.getTokens();
    for (int i = 0; i < normalToks.size(); ++i) {
        Constituent t = normalToks.get(i);
        characterOffsets[i] = new IntPair(ignoreUpToOffset + t.getStartCharOffset(), ignoreUpToOffset + t.getEndCharOffset());
    }

    List<Constituent> sentences = taNormal.getView(ViewNames.SENTENCE).getConstituents();
    int[] sentenceEndPositions = new int[sentences.size()];
    for (int i = 0; i < sentences.size(); ++i) {
        Constituent s = sentences.get(i);
        sentenceEndPositions[i] = s.getEndSpan();
    }

    // all info should be same except initial char offsets of tokens ignore spans of text
    TextAnnotation taOther = new TextAnnotation("test", "other", other, characterOffsets, tokens, sentenceEndPositions);
    List<Constituent> otherToks = taOther.getView(ViewNames.TOKENS).getConstituents();

    // Expected value first, actual second, so a failure message reads correctly.
    int thirdTokNormalStart = normalToks.get(2).getStartCharOffset();
    int thirdTokOtherStart = otherToks.get(2).getStartCharOffset();
    assertEquals(thirdTokNormalStart + leadingWaste.length(), thirdTokOtherStart);

    int eighthTokNormalStart = normalToks.get(8).getStartCharOffset();
    int eighthTokOtherStart = otherToks.get(8).getStartCharOffset();
    assertEquals(eighthTokNormalStart + leadingWaste.length(), eighthTokOtherStart);

    // Offset inside the leading ignorable text: no token there.
    int meaninglessStartOffset = taOther.getTokenIdFromCharacterOffset(2);
    assertEquals(-1, meaninglessStartOffset);

    // Offset past the end of the tokenized part: no token there.
    int meaninglessPastEndOffset = taOther.getTokenIdFromCharacterOffset(leadingWaste.length() + normal.length() + 5);
    assertEquals(-1, meaninglessPastEndOffset);

    // Offset 20 of the plain text is the first newline between the two sentences.
    int meaninglessInBetweenToksOffset = taNormal.getTokenIdFromCharacterOffset(20);
    assertEquals(-1, meaninglessInBetweenToksOffset);
}
Aggregations