use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
the class PipelineFactory method buildPipeline.
/**
 * Creates an annotator service that can produce the requested views.
 *
 * <p>Every recognized view name switches on the matching {@code PipelineConfigurator} flag; all
 * other settings keep the defaults supplied by {@code Stanford331Configurator} and
 * {@code PipelineConfigurator}. A name that is a valid view name but has no annotator in this
 * pipeline is only logged as a warning and otherwise ignored.
 *
 * @param disableCache whether the annotator service cache should be disabled; {@code null} is
 *        treated as {@code false}
 * @param views the names of the views to enable, one name per argument (not a single
 *        space-separated string); each must be contained in {@code ViewNames.getAllViewNames()}
 * @return AnnotatorService with specified NLP components
 * @throws IOException propagated from the configuration / annotator construction calls
 * @throws AnnotatorException propagated from the configuration / annotator construction calls
 * @throws IllegalArgumentException if one of {@code views} is not a known view name
 */
public static BasicAnnotatorService buildPipeline(Boolean disableCache, String... views) throws IOException, AnnotatorException {
    List<String> allViewNames = ViewNames.getAllViewNames();
    Map<String, String> nonDefaultValues = new HashMap<>();
    for (String vu : views) {
        if (!allViewNames.contains(vu)) {
            throw new IllegalArgumentException("The view name " + vu + " is not a valid view name. " + "The possible view names are static members of the class `ViewNames`. ");
        }
        switch (vu) {
            case ViewNames.POS:
                nonDefaultValues.put(PipelineConfigurator.USE_POS.key, Configurator.TRUE);
                break;
            case ViewNames.LEMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_LEMMA.key, Configurator.TRUE);
                break;
            case ViewNames.NER_CONLL:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_CONLL.key, Configurator.TRUE);
                break;
            case ViewNames.NER_ONTONOTES:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_ONTONOTES.key, Configurator.TRUE);
                break;
            case ViewNames.QUANTITIES:
                nonDefaultValues.put(PipelineConfigurator.USE_QUANTIFIER.key, Configurator.TRUE);
                break;
            case ViewNames.SHALLOW_PARSE:
                nonDefaultValues.put(PipelineConfigurator.USE_SHALLOW_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_VERB:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_VERB.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_NOM:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_NOM.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY:
                nonDefaultValues.put(PipelineConfigurator.USE_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.PARSE_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_PREP:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_PREP.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_COMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_COMMA.key, Configurator.TRUE);
                break;
            case ViewNames.VERB_SENSE:
                nonDefaultValues.put(PipelineConfigurator.USE_VERB_SENSE.key, Configurator.TRUE);
                break;
            case ViewNames.TRANSLITERATION:
                nonDefaultValues.put(PipelineConfigurator.USE_TRANSLITERATION.key, Configurator.TRUE);
                break;
            case ViewNames.TIMEX3:
                nonDefaultValues.put(PipelineConfigurator.USE_TIMEX3.key, Configurator.TRUE);
                break;
            case ViewNames.MENTION:
                nonDefaultValues.put(PipelineConfigurator.USE_MENTION.key, Configurator.TRUE);
                break;
            case ViewNames.RELATION:
                nonDefaultValues.put(PipelineConfigurator.USE_RELATION.key, Configurator.TRUE);
                break;
            case ViewNames.DATALESS_ESA:
                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_ESA.key, Configurator.TRUE);
                break;
            case ViewNames.DATALESS_W2V:
                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_W2V.key, Configurator.TRUE);
                break;
            case ViewNames.QUESTION_TYPE:
                nonDefaultValues.put(PipelineConfigurator.USE_QUESTION_TYPER.key, Configurator.TRUE);
                break;
            default:
                // a valid view name, but no annotator in this pipeline produces it
                logger.warn("View name " + vu + " is not supported yet. Look into the readme of the pipeline to see the list of valid annotators. ");
                break;
        }
    }
    // Boolean.TRUE.equals avoids the NullPointerException that unboxing a null Boolean would raise
    boolean cacheDisabled = Boolean.TRUE.equals(disableCache);
    nonDefaultValues.put(AnnotatorServiceConfigurator.DISABLE_CACHE.key, cacheDisabled ? Configurator.TRUE : Configurator.FALSE);
    // using the default settings and changing the views
    ResourceManager fullRm = (new PipelineConfigurator()).getConfig(new Stanford331Configurator().getConfig(nonDefaultValues));
    boolean splitOnHyphen = fullRm.getBoolean(PipelineConfigurator.SPLIT_ON_DASH.key);
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(splitOnHyphen, false));
    Map<String, Annotator> annotators = buildAnnotators(fullRm);
    return new SentencePipeline(taBldr, annotators, fullRm);
}
use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
the class OntonotesNamedEntityReader method nextAnnotation.
/**
 * Builds the annotation for one document's marked-up text and attaches a named entity view
 * built from the document's {@code enamex} spans.
 *
 * @param data the data from the file, including its XML markup.
 * @param docid the id representing the document name.
 * @return the xml text annotation; its text annotation carries the named entity view.
 * @throws AnnotatorException
 */
private XmlTextAnnotation nextAnnotation(String data, String docid) throws AnnotatorException {
    // we keep everything.
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(builder, processor);

    // turn the raw data into an annotation plus the list of markup spans found in it.
    XmlTextAnnotation xta = maker.createTextAnnotation(data, "OntoNotes 5.0", docid);
    TextAnnotation ta = xta.getTextAnnotation();
    List<SpanInfo> markupSpans = xta.getXmlMarkup();

    // the named entity view, filled from the enamex spans below
    View entityView = new SpanLabelView(VIEW_NAME, ta);
    for (SpanInfo span : markupSpans) {
        if (!"enamex".equalsIgnoreCase(span.label)) {
            continue;
        }
        IntPair originalOffsets = span.spanOffsets;
        Pair<String, IntPair> typeAttribute = span.attributes.get("type");
        String entityType = typeAttribute.getFirst();

        // translate offsets in the original (marked-up) text into offsets in the cleaned text
        int charStart = xta.getXmlSt().computeModifiedOffsetFromOriginal(originalOffsets.getFirst());
        int charEnd = xta.getXmlSt().computeModifiedOffsetFromOriginal(originalOffsets.getSecond());
        int firstToken = ta.getTokenIdFromCharacterOffset(charStart);
        // StringTransformation returns one-past-the-end index; TextAnnotation maps at-the-end index
        int lastToken = ta.getTokenIdFromCharacterOffset(charEnd - 1);

        if (firstToken != -1 && lastToken != -1) {
            int updatedCount = entityCounts.containsKey(entityType) ? entityCounts.get(entityType) + 1 : 1;
            entityCounts.put(entityType, updatedCount);
            // constituent token indexing uses one-past-the-end
            entityView.addConstituent(new Constituent(entityType, entityView.getViewName(), ta, firstToken, lastToken + 1));
            continue;
        }

        // at least one end of the span did not map to a token: dump what we have so far and report it
        for (Constituent existing : entityView.getConstituents()) {
            System.err.println(existing);
        }
        String spanText = ta.text.substring(charStart, charEnd);
        System.err.println("Something wonky in \"" + docid + "\", at " + originalOffsets + ", " + charStart + " - " + charEnd + " = " + spanText);
    }
    ta.addView(VIEW_NAME, entityView);
    return xta;
}
use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
the class XmlTextAnnotationMakerOntonotesTest method testNestedNames.
/**
 * Nested tags have caused the edit offsets to get messed up: checks that every markup span of a
 * text with nested ENAMEX tags still maps onto one of the reference entity strings.
 */
@Test
public void testNestedNames() {
    String text = "He spoke with Paul <ENAMEX TYPE=\"PERSON\"><ENAMEX TYPE=\"PERSON\" E_OFF=\"1\">Paula</ENAMEX> Zahn</ENAMEX> .";
    // we keep everything.
    XmlDocumentProcessor processor = new XmlDocumentProcessor(tagsWithText, tagsWithAtts, dropTags, true);
    TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    XmlTextAnnotationMaker maker = new XmlTextAnnotationMaker(builder, processor);

    // build the annotation from the marked-up text.
    XmlTextAnnotation xta = maker.createTextAnnotation(text, "OntoNotes 5.0", "test");
    TextAnnotation ta = xta.getTextAnnotation();
    List<XmlDocumentProcessor.SpanInfo> spans = xta.getXmlMarkup();
    StringTransformation offsetMap = xta.getXmlSt();

    for (XmlDocumentProcessor.SpanInfo span : spans) {
        // map both ends of the span from the original text into the cleaned text
        int start = offsetMap.computeModifiedOffsetFromOriginal(span.spanOffsets.getFirst());
        int end = offsetMap.computeModifiedOffsetFromOriginal(span.spanOffsets.getSecond());
        assertTrue(REF_ENTITIES.contains(ta.getText().substring(start, end)));
    }
}
use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
the class ESADatalessAnnotator method main.
/**
 * Command-line entry point: annotates one test document with the {@code DATALESS_ESA} view and
 * prints the predicted label ids followed by their label names.
 *
 * <p>If the config file cannot be read, a warning is logged and the default configuration is
 * used. Any failure while reading or annotating the test file is logged and terminates the JVM
 * with exit status -1.
 *
 * @param args command-line options as parsed by {@code getCMDOpts}: {@code config} is the config
 *        file path (default {@code config/project.properties}); {@code testFile} is the test
 *        file (default {@code data/graphicsTestDocument.txt})
 */
public static void main(String[] args) {
    CommandLine cmd = getCMDOpts(args);
    String configFile = cmd.getOptionValue("config", "config/project.properties");
    ResourceManager rm;
    try {
        ResourceManager nonDefaultRm = new ResourceManager(configFile);
        rm = new ESADatalessConfigurator().getConfig(nonDefaultRm);
    } catch (IOException e) {
        // do not fall back silently: a mistyped config path would otherwise go unnoticed
        logger.warn("Could not load config file " + configFile + " ... using the default configuration", e);
        rm = new ESADatalessConfigurator().getDefaultConfig();
    }
    String testFile = cmd.getOptionValue("testFile", "data/graphicsTestDocument.txt");
    try (BufferedReader br = new BufferedReader(new FileReader(new File(testFile)))) {
        // join all lines of the document into one space-separated string
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append(" ");
        }
        String text = sb.toString().trim();
        TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta = taBuilder.createTextAnnotation(text);
        ESADatalessAnnotator datalessAnnotator = new ESADatalessAnnotator(rm);
        datalessAnnotator.addView(ta);
        List<Constituent> annots = ta.getView(ViewNames.DATALESS_ESA).getConstituents();
        System.out.println("Predicted LabelIDs:");
        for (Constituent annot : annots) {
            System.out.println(annot.getLabel());
        }
        Map<String, String> labelNameMap = DatalessAnnotatorUtils.getLabelNameMap(rm.getString(DatalessConfigurator.LabelName_Path.key));
        System.out.println("Predicted Labels:");
        for (Constituent annot : annots) {
            System.out.println(labelNameMap.get(annot.getLabel()));
        }
    } catch (FileNotFoundException e) {
        // the exception is handed to the logger so the stack trace lands next to the message
        logger.error("Test File not found at " + testFile + " ... exiting", e);
        System.exit(-1);
    } catch (IOException e) {
        logger.error("IO Error while reading the test file ... exiting", e);
        System.exit(-1);
    } catch (AnnotatorException e) {
        logger.error("Error Annotating the Test Document with the Dataless View ... exiting", e);
        System.exit(-1);
    }
}
use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in project cogcomp-nlp by CogComp.
the class W2VDatalessTest method getTextAnnotation.
/**
 * Builds a {@code TextAnnotation} for the given text using a
 * {@code TokenizerTextAnnotationBuilder} backed by a new {@code StatefulTokenizer}.
 *
 * @param text the raw text to annotate
 * @return the text annotation created by the builder
 */
private TextAnnotation getTextAnnotation(String text) {
    return new TokenizerTextAnnotationBuilder(new StatefulTokenizer()).createTextAnnotation(text);
}
Aggregations