Search in sources :

Example 21 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class ExternalAnnotatorServiceFactory method buildPipeline.

/**
 * Builds an annotator service whose components are selected by the given ResourceManager; its
 * entries override the defaults supplied by {@link ExternalToolsConfigurator}.
 *
 * @param rm non-default config options
 * @return a {@code SentencePipeline} when the sentence-pipeline option is set in the merged
 *         configuration, otherwise a {@code BasicAnnotatorService}
 * @throws IOException
 * @throws AnnotatorException
 */
public static BasicAnnotatorService buildPipeline(ResourceManager rm) throws IOException, AnnotatorException {
    // Combine the caller's overrides with the default external-tools configuration.
    ResourceManager mergedConfig = new ExternalToolsConfigurator().getConfig(rm);

    Boolean dashSplit = mergedConfig.getBoolean(ExternalToolsConfigurator.SPLIT_ON_DASH);
    boolean sentenceMode = mergedConfig.getBoolean(ExternalToolsConfigurator.USE_SENTENCE_PIPELINE.key);

    StatefulTokenizer tokenizer = new StatefulTokenizer(dashSplit);
    TextAnnotationBuilder builder = new TokenizerTextAnnotationBuilder(tokenizer);
    Map<String, Annotator> components = buildAnnotators();

    if (sentenceMode) {
        return new SentencePipeline(builder, components, mergedConfig);
    }
    return new BasicAnnotatorService(builder, components, mergedConfig);
}
Also used : TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) ExternalToolsConfigurator(edu.illinois.cs.cogcomp.pipeline.common.ExternalToolsConfigurator) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)

Example 22 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class MainClass method annotate.

/**
 * Reads the file at {@code filepath} line by line; for each line it builds a TextAnnotation,
 * runs the preprocessor and the dependency annotator on it, and prints the annotator's view to
 * standard output. A line whose annotation fails has its stack trace printed and is skipped.
 *
 * @param filepath path of the text file to process, one input text per line
 * @throws IOException if the file cannot be opened
 */
private static void annotate(String filepath) throws IOException {
    DepAnnotator annotator = new DepAnnotator();
    TextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(true, false));
    Preprocessor preprocessor = new Preprocessor();
    // Files.lines keeps the underlying file open until the stream is closed, so the stream is
    // managed by try-with-resources instead of being left for the garbage collector.
    try (java.util.stream.Stream<String> lines = Files.lines(Paths.get(filepath))) {
        lines.forEach(line -> {
            TextAnnotation ta = taBuilder.createTextAnnotation(line);
            try {
                preprocessor.annotate(ta);
                annotator.addView(ta);
                System.out.println(ta.getView(annotator.getViewName()).toString());
            } catch (AnnotatorException e) {
                // Best effort: report the failure and continue with the next line.
                e.printStackTrace();
            }
        });
    }
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) Preprocessor(edu.illinois.cs.cogcomp.depparse.io.Preprocessor) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)

Example 23 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class W2VDatalessAnnotator method main.

/**
 * Command-line entry point: loads the configuration, reads the test file into one
 * space-joined string, adds the W2V dataless view to it and prints the predicted label ids
 * followed by the corresponding label names.
 *
 * @param args options parsed by {@code ESADatalessAnnotator.getCMDOpts}: {@code config} (config
 *        file path, defaults to {@code config/project.properties}) and {@code testFile} (test
 *        document path, defaults to {@code data/graphicsTestDocument.txt})
 */
public static void main(String[] args) {
    CommandLine cmd = ESADatalessAnnotator.getCMDOpts(args);
    String configFile = cmd.getOptionValue("config", "config/project.properties");
    ResourceManager rm;
    try {
        ResourceManager nonDefaultRm = new ResourceManager(configFile);
        rm = new W2VDatalessConfigurator().getConfig(nonDefaultRm);
    } catch (IOException e) {
        // Keep the fallback to the default configuration, but make it visible: silently
        // ignoring the failure hides a mistyped or unreadable config path from the user.
        logger.warn("Could not load config file " + configFile + " (" + e.getMessage() + "); using the default configuration");
        rm = new W2VDatalessConfigurator().getDefaultConfig();
    }
    String testFile = cmd.getOptionValue("testFile", "data/graphicsTestDocument.txt");
    StringBuilder sb = new StringBuilder();
    String line;
    try (BufferedReader br = new BufferedReader(new FileReader(new File(testFile)))) {
        // Join all lines of the document with single spaces.
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append(" ");
        }
        String text = sb.toString().trim();
        TokenizerTextAnnotationBuilder taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
        TextAnnotation ta = taBuilder.createTextAnnotation(text);
        W2VDatalessAnnotator datalessAnnotator = new W2VDatalessAnnotator(rm);
        datalessAnnotator.addView(ta);
        List<Constituent> annots = ta.getView(ViewNames.DATALESS_W2V).getConstituents();
        System.out.println("Predicted LabelIDs:");
        for (Constituent annot : annots) {
            System.out.println(annot.getLabel());
        }
        // Translate each predicted label id into its name using the configured label-name file.
        Map<String, String> labelNameMap = DatalessAnnotatorUtils.getLabelNameMap(rm.getString(DatalessConfigurator.LabelName_Path.key));
        System.out.println("Predicted Labels:");
        for (Constituent annot : annots) {
            System.out.println(labelNameMap.get(annot.getLabel()));
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
        logger.error("Test File not found at " + testFile + " ... exiting");
        System.exit(-1);
    } catch (AnnotatorException e) {
        e.printStackTrace();
        logger.error("Error Annotating the Test Document with the Dataless View ... exiting");
        System.exit(-1);
    } catch (IOException e) {
        e.printStackTrace();
        logger.error("IO Error while reading the test file ... exiting");
        System.exit(-1);
    }
}
Also used : AnnotatorException(edu.illinois.cs.cogcomp.annotation.AnnotatorException) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) CommandLine(org.apache.commons.cli.CommandLine) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer) W2VDatalessConfigurator(edu.illinois.cs.cogcomp.datalessclassification.config.W2VDatalessConfigurator) TextAnnotation(edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation) Constituent(edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)

Example 24 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class MultiLingualTokenizer method getTokenizer.

/**
 * Returns the TextAnnotationBuilder registered for a language code, creating it and storing it
 * in {@code tokenizerMap} the first time that code is requested.
 *
 * @param lang language code; "en", "es", "zh", "th" and "ja" each select a dedicated tokenizer,
 *        any other value gets a whitespace tokenizer
 * @return the builder stored in {@code tokenizerMap} for {@code lang}
 */
public static TextAnnotationBuilder getTokenizer(String lang) {
    if (tokenizerMap == null) {
        tokenizerMap = new HashMap<>();
    }
    // Already registered: hand back whatever the map holds for this code.
    if (tokenizerMap.containsKey(lang)) {
        return tokenizerMap.get(lang);
    }

    TextAnnotationBuilder created;
    switch (lang) {
        case "en":
            created = new TokenizerTextAnnotationBuilder(new StatefulTokenizer());
            break;
        case "es":
            created = new TokenizerTextAnnotationBuilder(new StanfordAnalyzer());
            break;
        case "zh":
            created = new TokenizerTextAnnotationBuilder(new CharacterTokenizer());
            break;
        case "th":
            created = new TokenizerTextAnnotationBuilder(new ThaiTokenizer());
            break;
        case "ja":
            created = new TokenizerTextAnnotationBuilder(new JapaneseTokenizer());
            break;
        default:
            created = new TokenizerTextAnnotationBuilder(new WhiteSpaceTokenizer());
            break;
    }
    tokenizerMap.put(lang, created);
    return created;
}
Also used : TextAnnotationBuilder(edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) HashMap(java.util.HashMap) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)

Example 25 with TokenizerTextAnnotationBuilder

use of edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder in project cogcomp-nlp by CogComp.

the class PipelineFactory method buildPipeline.

/**
 * Creates an AnnotatorService that enables the views named in the arguments. Each view name is
 * passed as its own string and must be one of the names returned by
 * {@code ViewNames.getAllViewNames()}; a valid name that has no matching pipeline option only
 * produces a warning.
 *
 * @param disableCache whether to set the annotator-service "disable cache" option; {@code null}
 *        is treated as {@code false}
 * @param views names of the views to enable
 * @return AnnotatorService with specified NLP components
 * @throws IllegalArgumentException if an entry of {@code views} is not a known view name
 * @throws IOException
 * @throws AnnotatorException
 */
public static BasicAnnotatorService buildPipeline(Boolean disableCache, String... views) throws IOException, AnnotatorException {
    List<String> allViewNames = ViewNames.getAllViewNames();
    Map<String, String> nonDefaultValues = new HashMap<>();
    for (String vu : views) {
        // Reject unknown names up front so the switch below only sees valid view names.
        if (!allViewNames.contains(vu)) {
            throw new IllegalArgumentException("The view name " + vu + " is not a valid view name. " + "The possible view names are static members of the class `ViewNames`. ");
        }
        switch(vu) {
            case ViewNames.POS:
                nonDefaultValues.put(PipelineConfigurator.USE_POS.key, Configurator.TRUE);
                break;
            case ViewNames.LEMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_LEMMA.key, Configurator.TRUE);
                break;
            case ViewNames.NER_CONLL:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_CONLL.key, Configurator.TRUE);
                break;
            case ViewNames.NER_ONTONOTES:
                nonDefaultValues.put(PipelineConfigurator.USE_NER_ONTONOTES.key, Configurator.TRUE);
                break;
            case ViewNames.QUANTITIES:
                nonDefaultValues.put(PipelineConfigurator.USE_QUANTIFIER.key, Configurator.TRUE);
                break;
            case ViewNames.SHALLOW_PARSE:
                nonDefaultValues.put(PipelineConfigurator.USE_SHALLOW_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_VERB:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_VERB.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_NOM:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_NOM.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.DEPENDENCY:
                nonDefaultValues.put(PipelineConfigurator.USE_DEP.key, Configurator.TRUE);
                break;
            case ViewNames.PARSE_STANFORD:
                nonDefaultValues.put(PipelineConfigurator.USE_STANFORD_PARSE.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_PREP:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_PREP.key, Configurator.TRUE);
                break;
            case ViewNames.SRL_COMMA:
                nonDefaultValues.put(PipelineConfigurator.USE_SRL_COMMA.key, Configurator.TRUE);
                break;
            case ViewNames.VERB_SENSE:
                nonDefaultValues.put(PipelineConfigurator.USE_VERB_SENSE.key, Configurator.TRUE);
                break;
            case ViewNames.TRANSLITERATION:
                nonDefaultValues.put(PipelineConfigurator.USE_TRANSLITERATION.key, Configurator.TRUE);
                break;
            case ViewNames.TIMEX3:
                nonDefaultValues.put(PipelineConfigurator.USE_TIMEX3.key, Configurator.TRUE);
                break;
            case ViewNames.MENTION:
                nonDefaultValues.put(PipelineConfigurator.USE_MENTION.key, Configurator.TRUE);
                break;
            case ViewNames.RELATION:
                nonDefaultValues.put(PipelineConfigurator.USE_RELATION.key, Configurator.TRUE);
                break;
            case ViewNames.DATALESS_ESA:
                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_ESA.key, Configurator.TRUE);
                break;
            case ViewNames.DATALESS_W2V:
                nonDefaultValues.put(PipelineConfigurator.USE_DATALESS_W2V.key, Configurator.TRUE);
                break;
            case ViewNames.QUESTION_TYPE:
                nonDefaultValues.put(PipelineConfigurator.USE_QUESTION_TYPER.key, Configurator.TRUE);
                break;
            default:
                logger.warn("View name " + vu + " is not supported yet. Look into the readme of the pipeline to see the list of valid annotators. ");
        }
    }
    // Boolean.TRUE.equals(...) is null-safe; unboxing a null Boolean in a plain `if` would throw
    // a NullPointerException after all the view options had already been collected.
    if (Boolean.TRUE.equals(disableCache)) {
        nonDefaultValues.put(AnnotatorServiceConfigurator.DISABLE_CACHE.key, Configurator.TRUE);
    } else {
        nonDefaultValues.put(AnnotatorServiceConfigurator.DISABLE_CACHE.key, Configurator.FALSE);
    }
    // using the default settings and changing the views
    ResourceManager fullRm = (new PipelineConfigurator()).getConfig(new Stanford331Configurator().getConfig(nonDefaultValues));
    boolean splitOnHyphen = fullRm.getBoolean(PipelineConfigurator.SPLIT_ON_DASH.key);
    TextAnnotationBuilder taBldr = new TokenizerTextAnnotationBuilder(new StatefulTokenizer(splitOnHyphen, false));
    Map<String, Annotator> annotators = buildAnnotators(fullRm);
    return new SentencePipeline(taBldr, annotators, fullRm);
}
Also used : Stanford331Configurator(edu.illinois.cs.cogcomp.pipeline.common.Stanford331Configurator) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) HashMap(java.util.HashMap) ResourceManager(edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager) PipelineConfigurator(edu.illinois.cs.cogcomp.pipeline.common.PipelineConfigurator) TemporalChunkerAnnotator(edu.illinois.cs.cogcomp.temporal.normalizer.main.TemporalChunkerAnnotator) W2VDatalessAnnotator(edu.illinois.cs.cogcomp.datalessclassification.ta.W2VDatalessAnnotator) RelationAnnotator(org.cogcomp.re.RelationAnnotator) ESADatalessAnnotator(edu.illinois.cs.cogcomp.datalessclassification.ta.ESADatalessAnnotator) POSTaggerAnnotator(edu.stanford.nlp.pipeline.POSTaggerAnnotator) PrepSRLAnnotator(edu.illinois.cs.cogcomp.prepsrl.PrepSRLAnnotator) ParserAnnotator(edu.stanford.nlp.pipeline.ParserAnnotator) MentionAnnotator(org.cogcomp.md.MentionAnnotator) QuestionTypeAnnotator(edu.illinois.cs.cogcomp.question_typer.QuestionTypeAnnotator) ChunkerAnnotator(edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator) DepAnnotator(edu.illinois.cs.cogcomp.depparse.DepAnnotator) POSAnnotator(edu.illinois.cs.cogcomp.pos.POSAnnotator) VerbSenseAnnotator(edu.illinois.cs.cogcomp.verbsense.VerbSenseAnnotator) NERAnnotator(edu.illinois.cs.cogcomp.ner.NERAnnotator) TokenizerTextAnnotationBuilder(edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder) StatefulTokenizer(edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)

Aggregations

TokenizerTextAnnotationBuilder (edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder)42 TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation)31 StatefulTokenizer (edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer)29 TextAnnotationBuilder (edu.illinois.cs.cogcomp.annotation.TextAnnotationBuilder)19 Test (org.junit.Test)16 ResourceManager (edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager)12 Constituent (edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent)11 AnnotatorException (edu.illinois.cs.cogcomp.annotation.AnnotatorException)9 POSAnnotator (edu.illinois.cs.cogcomp.pos.POSAnnotator)9 Properties (java.util.Properties)7 IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair)6 ChunkerAnnotator (edu.illinois.cs.cogcomp.chunker.main.ChunkerAnnotator)5 View (edu.illinois.cs.cogcomp.core.datastructures.textannotation.View)5 POSTaggerAnnotator (edu.stanford.nlp.pipeline.POSTaggerAnnotator)5 ParserAnnotator (edu.stanford.nlp.pipeline.ParserAnnotator)5 XmlTextAnnotationMaker (edu.illinois.cs.cogcomp.annotation.XmlTextAnnotationMaker)4 XmlDocumentProcessor (edu.illinois.cs.cogcomp.core.utilities.XmlDocumentProcessor)4 ChunkerConfigurator (edu.illinois.cs.cogcomp.chunker.main.ChunkerConfigurator)3 XmlTextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.XmlTextAnnotation)3 SpanLabelView (edu.illinois.cs.cogcomp.core.datastructures.textannotation.SpanLabelView)2