use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class ExternalAnnotatorServiceFactory method buildPipeline.
/**
 * Builds an AnnotatorService whose components are selected by the supplied ResourceManager;
 * its entries override the defaults in {@link ExternalToolsConfigurator}.
 *
 * @param rm non-default config options
 * @return a {@code SentencePipeline} when the USE_SENTENCE_PIPELINE option is true, otherwise a
 *         plain {@code BasicAnnotatorService}
 * @throws IOException propagated from the underlying pipeline construction
 * @throws AnnotatorException propagated from the underlying pipeline construction
 */
public static BasicAnnotatorService buildPipeline(ResourceManager rm) throws IOException, AnnotatorException {
    // Effective configuration: defaults from ExternalToolsConfigurator overlaid with the caller's values.
    ResourceManager config = new ExternalToolsConfigurator().getConfig(rm);
    Boolean dashSplitting = config.getBoolean(ExternalToolsConfigurator.SPLIT_ON_DASH);
    boolean sentenceLevel = config.getBoolean(ExternalToolsConfigurator.USE_SENTENCE_PIPELINE.key);

    StatefulTokenizer tokenizer = new StatefulTokenizer(dashSplitting);
    TextAnnotationBuilder textAnnotationBuilder = new TokenizerTextAnnotationBuilder(tokenizer);
    Map<String, Annotator> annotators = buildAnnotators();

    if (sentenceLevel) {
        return new SentencePipeline(textAnnotationBuilder, annotators, config);
    }
    return new BasicAnnotatorService(textAnnotationBuilder, annotators, config);
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class MainClass method annotate.
/**
 * Reads the file at {@code filepath} line by line, builds a TextAnnotation for each line,
 * runs the preprocessor and the dependency annotator on it, and prints the resulting view to
 * standard output.
 *
 * A line whose annotation fails with an AnnotatorException has its stack trace printed and is
 * skipped; processing continues with the next line.
 *
 * @param filepath path of the text file to annotate
 * @throws IOException if the file cannot be opened
 */
private static void annotate(String filepath) throws IOException {
    DepAnnotator annotator = new DepAnnotator();
    TextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));
    Preprocessor preprocessor = new Preprocessor();
    // Files.lines keeps the underlying file open until the stream is closed, so the stream is
    // scoped with try-with-resources to release the file handle even if a line fails.
    try (java.util.stream.Stream<String> lines = Files.lines(Paths.get(filepath))) {
        lines.forEach(line -> {
            TextAnnotation ta = taBuilder.createTextAnnotation(line);
            try {
                preprocessor.annotate(ta);
                annotator.addView(ta);
                System.out.println(ta.getView(annotator.getViewName()).toString());
            } catch (AnnotatorException e) {
                e.printStackTrace();
            }
        });
    }
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class W2VDatalessAnnotator method main.
/**
 * Command-line entry point: loads the configuration, reads the test document, annotates it
 * with the W2V dataless view, and prints the predicted label IDs followed by their names.
 *
 * If the config file cannot be read, the default W2VDatalessConfigurator configuration is
 * used instead. Any failure while reading or annotating the test file is logged and the
 * process exits with status -1.
 *
 * @param args config: config file path testFile: Test File
 */
public static void main(String[] args) {
    CommandLine options = ESADatalessAnnotator.getCMDOpts(args);

    ResourceManager rm;
    try {
        String configPath = options.getOptionValue("config", "config/project.properties");
        ResourceManager overrides = new ResourceManager(configPath);
        rm = new W2VDatalessConfigurator().getConfig(overrides);
    } catch (IOException e) {
        // Config file unreadable: continue with the configurator's defaults.
        rm = new W2VDatalessConfigurator().getDefaultConfig();
    }

    String testFile = options.getOptionValue("testFile", "data/graphicsTestDocument.txt");
    StringBuilder content = new StringBuilder();
    try (BufferedReader reader = new BufferedReader(new FileReader(testFile))) {
        // Join all lines of the document with single spaces.
        for (String current = reader.readLine(); current != null; current = reader.readLine()) {
            content.append(current).append(" ");
        }
        String documentText = content.toString().trim();

        TokenizerTextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation annotation = builder.createTextAnnotation(documentText);
        W2VDatalessAnnotator w2vAnnotator = new W2VDatalessAnnotator(rm);
        w2vAnnotator.addView(annotation);

        List<Constituent> predictions = annotation.getView(ViewNames.DATALESS_W2V).getConstituents();
        System.out.println("Predicted LabelIDs:");
        for (Constituent prediction : predictions) {
            System.out.println(prediction.getLabel());
        }

        String labelNamePath = rm.getString(DatalessConfigurator.LabelName_Path.key);
        Map<String, String> idToName = DatalessAnnotatorUtils.getLabelNameMap(labelNamePath);
        System.out.println("Predicted Labels:");
        for (Constituent prediction : predictions) {
            System.out.println(idToName.get(prediction.getLabel()));
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        logger.error("Test File not found at " + testFile + " ... exiting");
        System.exit(-1);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        logger.error("Error Annotating the Test Document with the Dataless View ... exiting");
        System.exit(-1);
    } catch (IOException e) {
        e.printStackTrace();
        logger.error("IO Error while reading the test file ... exiting");
        System.exit(-1);
    }
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class MultiLingualTokenizer method getTokenizer.
/**
 * Returns the TextAnnotationBuilder for the given language code, creating it on the first
 * request and serving it from {@code tokenizerMap} afterwards.
 *
 * Tokenizer per language code: "en" StatefulTokenizer, "es" StanfordAnalyzer,
 * "zh" CharacterTokenizer, "th" ThaiTokenizer, "ja" JapaneseTokenizer; every other code uses
 * WhiteSpaceTokenizer.
 *
 * The cache is created lazily and this method performs no synchronization of its own.
 *
 * @param lang language code; must not be null
 * @return the cached builder for {@code lang}
 */
public static TextAnnotationBuilder getTokenizer(String lang) {
    if (tokenizerMap == null) {
        tokenizerMap = new HashMap<>();
    }
    // Cache hit: hand back the previously built instance.
    if (tokenizerMap.containsKey(lang)) {
        return tokenizerMap.get(lang);
    }

    TextAnnotationBuilder builder;
    switch (lang) {
        case "en":
            builder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            break;
        case "es":
            builder = new TokenizerTextAnnotationBuilder(new StanfordAnalyzer());
            break;
        case "zh":
            builder = new TokenizerTextAnnotationBuilder(new CharacterTokenizer());
            break;
        case "th":
            builder = new TokenizerTextAnnotationBuilder(new ThaiTokenizer());
            break;
        case "ja":
            builder = new TokenizerTextAnnotationBuilder(new JapaneseTokenizer());
            break;
        default:
            builder = new TokenizerTextAnnotationBuilder(new WhiteSpaceTokenizer());
            break;
    }
    tokenizerMap.put(lang, builder);
    return tokenizerMap.get(lang);
}
use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.
the class PipelineFactory method buildPipeline.
/**
 * Creates an AnnotatorService that provides the requested views. Each view name is passed as
 * its own argument and switches on the matching option in {@link PipelineConfigurator}.
 *
 * A name that is listed by {@code ViewNames.getAllViewNames()} but has no matching pipeline
 * option only produces a warning in the log and is otherwise ignored.
 *
 * @param disableCache whether to disable the annotation cache; {@code null} is treated as
 *        {@code false}
 * @param views names of the views to enable, each one a constant of {@link ViewNames}
 * @return AnnotatorService with specified NLP components
 * @throws IllegalArgumentException if a name is not listed by {@code ViewNames.getAllViewNames()}
 * @throws IOException propagated from the underlying pipeline construction
 * @throws AnnotatorException propagated from the underlying pipeline construction
 */
public static BasicAnnotatorService buildPipeline(Boolean disableCache, String... views) throws IOException, AnnotatorException {
    List<String> allViewNames = ViewNames.getAllViewNames();
    Map<String, String> nonDefaultValues = new HashMap<>();
    for (String vu : views) {
        if (!allViewNames.contains(vu)) {
            throw new IllegalArgumentException("The view name " + vu + " is not a valid view name. " + "The possible view names are static members of the class `ViewNames`. ");
        }
        switch (vu) {
            case ViewNames.POS:
                nonDefaultValues.put(PipelineConfigurator.USE_POS.key, Configurator.TRUE);
                break;
            case ViewNames.LEMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_LEMMA.key, Configurator.TRUE);
                break;
            case ViewNames.NER_CONLL:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_CONLL.key, Configurator.TRUE);
                break;
            case ViewNames.NER_ONTONOTES:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_ONTONOTES.key, Configurator.TRUE);
                break;
            case ViewNames.QUANTITIES:
                nonDefaultValues.put(PipelineConfigurator.USE_QUANTIFIER.key, Configurator.TRUE);
                break;
            case ViewNames.SHALLOW_PARSE:
                nonDefaultValues.put(PipelineConfigurator.USE_SHALLOW_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_VERB:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_VERB.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_NOM:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_NOM.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY:
                nonDefaultValues.put(PipelineConfigurator.USE_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.PARSE_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_PREP:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_PREP.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_COMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_COMMA.key, Configurator.TRUE);
                break;
            case ViewNames.VERB_SENSE:
                nonDefaultValues.put(PipelineConfigurator.USE_VERB_SENSE.key, Configurator.TRUE);
                break;
            case ViewNames.TRANSLITERATION:
                nonDefaultValues.put(PipelineConfigurator.USE_TRANSLITERATION.key, Configurator.TRUE);
                break;
            case ViewNames.TIMEX3:
                nonDefaultValues.put(PipelineConfigurator.USE_TIMEX3.key, Configurator.TRUE);
                break;
            case ViewNames.MENTION:
                nonDefaultValues.put(PipelineConfigurator.USE_MENTION.key, Configurator.TRUE);
                break;
            case ViewNames.RELATION:
                nonDefaultValues.put(PipelineConfigurator.USE_RELATION.key, Configurator.TRUE);
                break;
            case ViewNames.DATALESS_ESA:
                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_ESA.key, Configurator.TRUE);
                break;
            case ViewNames.DATALESS_W2V:
                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_W2V.key, Configurator.TRUE);
                break;
            case ViewNames.QUESTION_TYPE:
                nonDefaultValues.put(PipelineConfigurator.USE_QUESTION_TYPER.key, Configurator.TRUE);
                break;
            default:
                // A valid view name for which the pipeline has no switch: warn and carry on.
                logger.warn("View name " + vu + " is not supported yet. Look into the readme of the pipeline to see the list of valid annotators. ");
        }
    }
    // Boolean.TRUE.equals avoids the NullPointerException that unboxing a null Boolean would raise.
    if (Boolean.TRUE.equals(disableCache)) {
        nonDefaultValues.put(AnnotatorServiceConfigurator.DISABLE_CACHE.key, Configurator.TRUE);
    } else {
        nonDefaultValues.put(AnnotatorServiceConfigurator.DISABLE_CACHE.key, Configurator.FALSE);
    }
    // using the default settings and changing the views
    ResourceManager fullRm = (new PipelineConfigurator()).getConfig(new Stanford331Configurator().getConfig(nonDefaultValues));
    boolean splitOnHyphen = fullRm.getBoolean(PipelineConfigurator.SPLIT_ON_DASH.key);
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(splitOnHyphen, false));
    Map<String, Annotator> annotators = buildAnnotators(fullRm);
    return new SentencePipeline(taBldr, annotators, fullRm);
}
Aggregations