Use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
Class ConvertOntonotesToColumn, method getNameTextAnnotation.
/**
 * Reads the given file, strips its XML markup, and builds a text annotation whose
 * {@code ViewNames.NER_ONTONOTES} view holds one constituent per {@code enamex} span
 * found in the markup.
 *
 * @param file the file to read.
 * @return the XmlTextAnnotation containing the text annotation, and xml markup offset data.
 * @throws IOException if the canonical path cannot be resolved or the file cannot be read.
 * @throws IllegalStateException if an {@code enamex} span carries no {@code type} attribute.
 */
private static XmlTextAnnotation getNameTextAnnotation(File file) throws IOException {
    String document = LineIO.slurp(file.getCanonicalPath());
    // we keep everything.
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    // read the file and create the annotation.
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> markupSpans = xta.getXmlMarkup();
    // create the named entity view.
    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    for (SpanInfo si : markupSpans) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        // Fail with a diagnosable message instead of a bare NullPointerException
        // when the entity tag has no type attribute.
        if (!si.attributes.containsKey("type")) {
            throw new IllegalStateException("enamex span at " + charOffsets
                    + " has no 'type' attribute in " + file);
        }
        String neLabel = si.attributes.get("type").getFirst();
        // map offsets in the original (marked-up) text to offsets in the cleaned text.
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        // constituent token indexing uses one-past-the-end
        Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
        nerView.addConstituent(neCon);
    }
    ta.addView(ViewNames.NER_ONTONOTES, nerView);
    return xta;
}
Use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
Class OntonotesNerReaderExample, method main.
/**
 * Example driver: reads one OntoNotes name-annotated file, builds a text annotation from it,
 * converts every {@code enamex} markup span into a constituent of a
 * {@code ViewNames.NER_ONTONOTES} view, and prints the result.
 *
 * @param args optional; when present, {@code args[0]} is the path of the file to read,
 *        otherwise a built-in default path is used.
 * @throws ClassNotFoundException declared for signature compatibility; nothing in this
 *         method body visibly throws it.
 * @throws SQLException declared for signature compatibility; nothing in this method body
 *         visibly throws it.
 * @throws IOException if the input file cannot be read.
 */
public static void main(String[] args) throws ClassNotFoundException, SQLException, IOException {
    String defaultInFile = "/shared/corpora/corporaWeb/multi-mode/multi/ontonotes-release-5.0/data/files/data/english/annotations/nw/wsj/00/wsj_0061.name";
    // allow the caller to point the example at another file without editing the source.
    String inFile = (args != null && args.length > 0) ? args[0] : defaultInFile;
    int counter = 0;
    long start = System.currentTimeMillis();
    // define all tags with text.
    Set<String> tagsWithText = new HashSet<>();
    // define the attributes we want to keep for the tags we have.
    Map<String, Set<String>> tagsWithAtts = new HashMap<>();
    {
        Set<String> docAttrs = new HashSet<>();
        docAttrs.add("docno");
        tagsWithAtts.put("doc", docAttrs);
    }
    {
        Set<String> nameAttrs = new HashSet<>();
        nameAttrs.add("type");
        tagsWithAtts.put("enamex", nameAttrs);
    }
    boolean throwExceptionOnXmlParseFail = true;
    // we keep everything.
    Set<String> dropTags = new HashSet<>();
    XmlDocumentProcessor xmlProcessor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, throwExceptionOnXmlParseFail);
    StatefulTokenizer st = new StatefulTokenizer();
    TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(st);
    XmlTextAnnotationMaker xtam = new XmlTextAnnotationMaker(taBuilder, xmlProcessor);
    String document = LineIO.slurp(inFile);
    XmlTextAnnotation xta = xtam.createTextAnnotation(document, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> markupSpans = xta.getXmlMarkup();
    System.out.println(ta + "\n");
    View nerView = new SpanLabelView(ViewNames.NER_ONTONOTES, ta);
    String cleanText = ta.getText();
    for (XmlDocumentProcessor.SpanInfo si : markupSpans) {
        if (!"enamex".equalsIgnoreCase(si.label)) {
            continue;
        }
        IntPair charOffsets = si.spanOffsets;
        String neLabel = si.attributes.get("type").getFirst();
        // map offsets in the original (marked-up) text to offsets in the cleaned text.
        int cleanTextCharStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getFirst());
        int cleanTextCharEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(charOffsets.getSecond());
        System.err.println("ne string: '" + cleanText.substring(cleanTextCharStart, cleanTextCharEnd) + "'");
        int cleanTextNeTokStart = ta.getTokenIdFromCharacterOffset(cleanTextCharStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int cleanTextNeTokEnd = ta.getTokenIdFromCharacterOffset(cleanTextCharEnd - 1);
        // constituent token indexing uses one-past-the-end
        Constituent neCon = new Constituent(neLabel, nerView.getViewName(), ta, cleanTextNeTokStart, cleanTextNeTokEnd + 1);
        nerView.addConstituent(neCon);
    }
    // One document has been processed at this point; report once, after all of its
    // spans are in the view, rather than once per markup span.
    counter++;
    System.out.println("Read " + counter + " documents in " + (System.currentTimeMillis() - start));
    System.out.println(nerView.toString());
}
Use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
Class ExampleUsage, method SemEvalAnnotate.
/**
 * Example: annotates a fixed sentence, builds an {@code RE_ANNOTATED} view in which every
 * token constituent carries WordNet attributes, then classifies a relation between the
 * token spans [0, 1) and [6, 7) with the SemEval relation classifier and prints the
 * predicted tag.
 *
 * If any preparation step (annotators, datastore resources, gazetteers, WordNet) fails,
 * the stack trace is printed and the method returns without classifying.
 */
public static void SemEvalAnnotate() {
    String text = "People have been moving back into downtown.";
    String corpus = "semeval";
    String textId = "001";
    // Create a TextAnnotation From Text
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);
    POSAnnotator pos_annotator = new POSAnnotator();
    ChunkerAnnotator chunker = new ChunkerAnnotator(true);
    chunker.initialize(new ChunkerConfigurator().getDefaultConfig());
    Properties stanfordProps = new Properties();
    stanfordProps.put("annotators", "pos, parse");
    // NOTE(review): this stores a Boolean, and Properties.getProperty returns null for
    // non-String values; confirm the consumer reads it via get() rather than getProperty().
    stanfordProps.put("parse.originalDependencies", true);
    stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
    stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
    POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
    ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
    StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator);
    String modelPath;
    FlatGazetteers gazetteers;
    try {
        ta.addView(pos_annotator);
        chunker.addView(ta);
        stanfordDepHandler.addView(ta);
        Datastore ds = new Datastore(new ResourceConfigurator().getDefaultConfig());
        File model = ds.getDirectory("org.cogcomp.re", "SEMEVAL", 1.1, false);
        modelPath = model.getPath();
        File gazetteersResource = ds.getDirectory("org.cogcomp.gazetteers", "gazetteers", 1.3, false);
        gazetteers = (FlatGazetteers) GazetteersFactory.get(5, gazetteersResource.getPath() + File.separator + "gazetteers", true, Language.English);
        WordNetManager.loadConfigAsClasspathResource(true);
        WordNetManager wordnet = WordNetManager.getInstance();
        View annotatedTokenView = new SpanLabelView("RE_ANNOTATED", ta);
        for (Constituent co : ta.getView(ViewNames.TOKENS).getConstituents()) {
            // copy the token into the new view, carrying over its existing attributes.
            Constituent c = co.cloneForNewView("RE_ANNOTATED");
            for (String s : co.getAttributeKeys()) {
                c.addAttribute(s, co.getAttribute(s));
            }
            c.addAttribute("WORDNETTAG", BIOFeatureExtractor.getWordNetTags(wordnet, c));
            c.addAttribute("WORDNETHYM", BIOFeatureExtractor.getWordNetHyms(wordnet, c));
            annotatedTokenView.addConstituent(c);
        }
        ta.addView("RE_ANNOTATED", annotatedTokenView);
    } catch (Exception e) {
        e.printStackTrace();
        // Without the gazetteers and the model path the steps below cannot run;
        // continuing would only fail with a NullPointerException on the gazetteers.
        return;
    }
    Constituent source = new Constituent("first", "Mention", ta, 0, 1);
    Constituent target = new Constituent("second", "Mention", ta, 6, 7);
    source.addAttribute("GAZ", gazetteers.annotatePhrase(source));
    target.addAttribute("GAZ", gazetteers.annotatePhrase(target));
    Relation relation = new Relation("TEST", source, target, 1.0f);
    String prefix = modelPath + File.separator + "SEMEVAL" + File.separator + "SEMEVAL";
    semeval_relation_classifier classifier = new semeval_relation_classifier(prefix + ".lc", prefix + ".lex");
    String tag = classifier.discreteValue(relation);
    System.out.println(tag);
}
Use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
Class ExampleUsage, method AnnotatorExample.
/**
 * Example: annotates a fixed sentence with the POS annotator, chunker, Stanford dependency
 * handler and relation annotator, then prints every relation found in the
 * {@code ViewNames.MENTION} view.
 *
 * If any annotator fails, the stack trace is printed and the method returns without
 * looking up the mention view.
 */
public static void AnnotatorExample() {
    String text = "He went to Chicago after his Father moved there.";
    String corpus = "story";
    String textId = "001";
    // Create a TextAnnotation From Text
    TextAnnotationBuilder stab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    TextAnnotation ta = stab.createTextAnnotation(corpus, textId, text);
    POSAnnotator pos_annotator = new POSAnnotator();
    ChunkerAnnotator chunker = new ChunkerAnnotator(true);
    chunker.initialize(new ChunkerConfigurator().getDefaultConfig());
    Properties stanfordProps = new Properties();
    stanfordProps.put("annotators", "pos, parse");
    // NOTE(review): this stores a Boolean, and Properties.getProperty returns null for
    // non-String values; confirm the consumer reads it via get() rather than getProperty().
    stanfordProps.put("parse.originalDependencies", true);
    stanfordProps.put("parse.maxlen", Stanford331Configurator.STFRD_MAX_SENTENCE_LENGTH);
    stanfordProps.put("parse.maxtime", Stanford331Configurator.STFRD_TIME_PER_SENTENCE);
    POSTaggerAnnotator posAnnotator = new POSTaggerAnnotator("pos", stanfordProps);
    ParserAnnotator parseAnnotator = new ParserAnnotator("parse", stanfordProps);
    StanfordDepHandler stanfordDepHandler = new StanfordDepHandler(posAnnotator, parseAnnotator);
    RelationAnnotator relationAnnotator = new RelationAnnotator();
    try {
        ta.addView(pos_annotator);
        chunker.addView(ta);
        stanfordDepHandler.addView(ta);
        relationAnnotator.addView(ta);
    } catch (Exception e) {
        e.printStackTrace();
        // The mention view is only read after all four annotators succeeded;
        // stop here instead of continuing with a partially annotated text.
        return;
    }
    View mentionView = ta.getView(ViewNames.MENTION);
    for (Relation r : mentionView.getRelations()) {
        IOHelper.printRelation(r);
    }
}
Use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
Class Preprocess, method runNER.
/**
 * Tokenizes the given string into a text annotation and asks {@code co} to produce its
 * view on it.
 *
 * The call to {@code co} is best-effort: if it throws, the stack trace is printed and the
 * annotation is returned as it stands at that point.
 *
 * @param s the raw text to annotate.
 * @return the text annotation built from {@code s}.
 */
public TextAnnotation runNER(String s) {
    // NOTE(review): the first tokenizer flag is named after hyphen splitting; the meaning
    // of the second flag is not visible from here; confirm against StatefulTokenizer.
    final boolean splitOnHyphens = false;
    StatefulTokenizer tokenizer = new StatefulTokenizer(splitOnHyphens, false);
    TextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(tokenizer);
    TextAnnotation annotation = builder.createTextAnnotation("001", "001", s);
    try {
        // co is declared outside this method; presumably the NER annotator.
        co.getView(annotation);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return annotation;
}
Aggregations