Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class PipelineFactory, method buildPipeline.
/**
 * Creates an AnnotatorService configured to produce the requested views.
 *
 * <p>Each view name is passed as a separate argument. A name that is known to
 * {@code ViewNames} but has no matching case below only triggers a warning and is otherwise
 * ignored.
 *
 * @param disableCache whether the cache of the annotator service should be disabled;
 *        {@code null} is treated as {@code false}
 * @param views the names of the views to enable; each must be contained in
 *        {@code ViewNames.getAllViewNames()}
 * @return AnnotatorService with specified NLP components
 * @throws IOException if raised while the pipeline is being assembled
 * @throws AnnotatorException if raised while the pipeline is being assembled
 * @throws IllegalArgumentException if a requested name is not contained in
 *         {@code ViewNames.getAllViewNames()}
 */
public static BasicAnnotatorService buildPipeline(Boolean disableCache, String... views) throws IOException, AnnotatorException {
    List<String> allViewNames = ViewNames.getAllViewNames();
    Map<String, String> nonDefaultValues = new HashMap<>();
    for (String vu : views) {
        // Reject unknown names up front so the switch below only sees valid view names.
        if (!allViewNames.contains(vu)) {
            throw new IllegalArgumentException("The view name " + vu + " is not a valid view name. " + "The possible view names are static members of the class `ViewNames`. ");
        }
        switch (vu) {
            case ViewNames.POS:
                nonDefaultValues.put(PipelineConfigurator.USE_POS.key, Configurator.TRUE);
                break;
            case ViewNames.LEMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_LEMMA.key, Configurator.TRUE);
                break;
            case ViewNames.NER_CONLL:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_CONLL.key, Configurator.TRUE);
                break;
            case ViewNames.NER_ONTONOTES:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_ONTONOTES.key, Configurator.TRUE);
                break;
            case ViewNames.QUANTITIES:
                nonDefaultValues.put(PipelineConfigurator.USE_QUANTIFIER.key, Configurator.TRUE);
                break;
            case ViewNames.SHALLOW_PARSE:
                nonDefaultValues.put(PipelineConfigurator.USE_SHALLOW_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_VERB:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_VERB.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY:
                nonDefaultValues.put(PipelineConfigurator.USE_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.PARSE_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_PREP:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_PREP.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_COMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_COMMA.key, Configurator.TRUE);
                break;
            default:
                logger.warn("View name " + vu + " is not supported yet. Look into the readme of the pipeline to see the list of valid annotators. ");
        }
    }
    // Boolean.TRUE.equals(...) avoids a NullPointerException from unboxing a null Boolean.
    nonDefaultValues.put(AnnotatorServiceConfigurator.DISABLE_CACHE.key,
            Boolean.TRUE.equals(disableCache) ? Configurator.TRUE : Configurator.FALSE);
    // using the default settings and changing the views
    ResourceManager fullRm = (new PipelineConfigurator()).getConfig(new Stanford331Configurator().getConfig(nonDefaultValues));
    boolean splitOnHyphen = fullRm.getBoolean(PipelineConfigurator.SPLIT_ON_DASH.key);
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(splitOnHyphen));
    Map<String, Annotator> annotators = buildAnnotators(fullRm);
    return new SentencePipeline(taBldr, annotators, fullRm);
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class NerOntonotesTest, method testOntonotesNer.
/**
 * Runs the OntoNotes NER annotator over {@code TEST_INPUT} and checks that the resulting view
 * holds exactly four constituents.
 */
@Test
public void testOntonotesNer() {
    TextAnnotationBuilder tab = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
    Properties props = new Properties();
    NERAnnotator nerOntonotes = NerAnnotatorManager.buildNerAnnotator(new ResourceManager(props), ViewNames.NER_ONTONOTES);
    TextAnnotation taOnto = tab.createTextAnnotation("", "", TEST_INPUT);
    try {
        nerOntonotes.getView(taOnto);
    } catch (AnnotatorException e) {
        // Print the trace before failing: fail() only carries the message, not the stack.
        e.printStackTrace();
        fail(e.getMessage());
    }
    View v = taOnto.getView(nerOntonotes.getViewName());
    // assertEquals takes (expected, actual); this order keeps the failure message accurate.
    assertEquals(4, v.getConstituents().size());
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class MainClass, method annotate.
/**
 * Reads the file at {@code filepath} line by line, builds a TextAnnotation for each line,
 * preprocesses it, adds the view of the {@code DepAnnotator} and prints that view to standard
 * output.
 *
 * <p>An {@code AnnotatorException} raised for one line is printed and processing continues with
 * the next line.
 *
 * @param filepath path of the text file to annotate
 * @throws IOException if the file cannot be opened
 */
private static void annotate(String filepath) throws IOException {
    DepAnnotator annotator = new DepAnnotator();
    TextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true));
    Preprocessor preprocessor = new Preprocessor();
    // Files.lines keeps the file open until the stream is closed, so close it explicitly.
    try (java.util.stream.Stream<String> lines = Files.lines(Paths.get(filepath))) {
        lines.forEach(line -> {
            TextAnnotation ta = taBuilder.createTextAnnotation(line);
            try {
                preprocessor.annotate(ta);
                annotator.addView(ta);
                System.out.println(ta.getView(annotator.getViewName()).toString());
            } catch (AnnotatorException e) {
                e.printStackTrace();
            }
        });
    }
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class ExternalAnnotatorServiceFactory, method buildPipeline.
/**
 * Builds an AnnotatorService whose components are selected by the given ResourceManager, which
 * overrides the defaults in {@link ExternalToolsConfigurator}.
 *
 * <p>Returns a {@code SentencePipeline} when the {@code USE_SENTENCE_PIPELINE} setting is true
 * and a plain {@code BasicAnnotatorService} otherwise.
 *
 * @param rm non-default config options
 * @return AnnotatorService with specified NLP components
 * @throws IOException
 * @throws AnnotatorException
 */
public static BasicAnnotatorService buildPipeline(ResourceManager rm) throws IOException, AnnotatorException {
    // Overlay the caller's settings on top of the configurator defaults.
    ResourceManager mergedConfig = new ExternalToolsConfigurator().getConfig(rm);
    Boolean dashSplitting = mergedConfig.getBoolean(ExternalToolsConfigurator.SPLIT_ON_DASH);
    boolean sentenceLevel = mergedConfig.getBoolean(ExternalToolsConfigurator.USE_SENTENCE_PIPELINE.key);

    StatefulTokenizer tokenizer = new StatefulTokenizer(dashSplitting);
    TextAnnotationBuilder annotationBuilder = new TokenizerTextAnnotationBuilder(tokenizer);
    Map<String, Annotator> annotatorMap = buildAnnotators();

    if (sentenceLevel) {
        return new SentencePipeline(annotationBuilder, annotatorMap, mergedConfig);
    }
    return new BasicAnnotatorService(annotationBuilder, annotatorMap, mergedConfig);
}
Use of edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer in the cogcomp-nlp project by CogComp.
From the class MultiLingualTokenizer, method getTokenizer.
/**
 * Returns the TextAnnotationBuilder registered for the given language code, creating and
 * caching it in {@code tokenizerMap} on first request.
 *
 * <p>Tokenizer chosen per code: {@code "en"} StatefulTokenizer, {@code "es"} StanfordAnalyzer,
 * {@code "zh"} CharacterTokenizer, {@code "th"} ThaiTokenizer; any other code falls back to
 * WhiteSpaceTokenizer.
 *
 * @param lang the language code
 * @return the cached or newly created builder for {@code lang}
 */
public static TextAnnotationBuilder getTokenizer(String lang) {
    if (tokenizerMap == null) {
        tokenizerMap = new HashMap<>();
    }
    // Cache hit: hand back the builder created by an earlier call.
    if (tokenizerMap.containsKey(lang)) {
        return tokenizerMap.get(lang);
    }
    final TextAnnotationBuilder builder;
    switch (lang) {
        case "en":
            builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            break;
        case "es":
            builder = new TokenizerTextAnnotationBuilder(new StanfordAnalyzer());
            break;
        case "zh":
            builder = new TokenizerTextAnnotationBuilder(new CharacterTokenizer());
            break;
        case "th":
            builder = new TokenizerTextAnnotationBuilder(new ThaiTokenizer());
            break;
        default:
            builder = new TokenizerTextAnnotationBuilder(new WhiteSpaceTokenizer());
            break;
    }
    tokenizerMap.put(lang, builder);
    return builder;
}
Aggregations