Search in sources :

Example 1 with TokenizerME

use of opennlp.tools.tokenize.TokenizerME in project languagetool by languagetool-org.

the class EnglishChunker method tokenize.

// package-private (instead of private) so that test cases can call it
/**
 * Splits a sentence into tokens using a {@link TokenizerME} built from {@code tokenModel}.
 *
 * @param sentence the sentence to split
 * @return the tokens produced by the tokenizer
 */
String[] tokenize(String sentence) {
    TokenizerME meTokenizer = new TokenizerME(tokenModel);
    // OpenNLP expects the plain apostrophe, so swap the typographic one before tokenizing
    String normalized = sentence.replace('’', '\'');
    String[] tokens = meTokenizer.tokenize(normalized);
    return tokens;
}
Also used : TokenizerME(opennlp.tools.tokenize.TokenizerME)

Example 2 with TokenizerME

use of opennlp.tools.tokenize.TokenizerME in project textdb by TextDB.

the class POSTagexample method Tokenize.

/**
 * Tokenizes a sentence with an OpenNLP {@link TokenizerME}.
 *
 * The tokenizer model is read from the hard-coded relative path
 * {@code ./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-token.bin}
 * on every call.
 *
 * @param sentence the sentence to split into tokens
 * @return the tokens returned by the tokenizer
 * @throws InvalidFormatException declared for failures while loading the model
 * @throws IOException if the model file cannot be opened or read
 */
public static String[] Tokenize(String sentence) throws InvalidFormatException, IOException {
    // try-with-resources closes the model stream even when loading the model or
    // tokenizing throws; the previous explicit close() was skipped on failure.
    try (InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);
        return tokenizer.tokenize(sentence);
    }
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TokenizerME(opennlp.tools.tokenize.TokenizerME) TokenizerModel(opennlp.tools.tokenize.TokenizerModel) Tokenizer(opennlp.tools.tokenize.Tokenizer)

Example 3 with TokenizerME

use of opennlp.tools.tokenize.TokenizerME in project deeplearning4j by deeplearning4j.

the class ConcurrentTokenizer method initialize.

/**
 * Sets up this instance from the supplied UIMA context.
 *
 * Delegates to the superclass first, then fetches the tokenizer model resource
 * registered under {@code UimaUtil.MODEL_PARAMETER} and builds the
 * {@link TokenizerME} from its model.
 *
 * Note: perform all initialization here rather than in the constructor.
 *
 * @param context the UIMA context the model resource is looked up in
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);
    final TokenizerModel loadedModel;
    try {
        Object resource = context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        loadedModel = ((TokenizerModelResource) resource).getModel();
    } catch (ResourceAccessException accessFailure) {
        // surface resource lookup problems as an initialization failure, keeping the cause
        throw new ResourceInitializationException(accessFailure);
    }
    tokenizer = new TokenizerME(loadedModel);
}
Also used : ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TokenizerModelResource(opennlp.uima.tokenize.TokenizerModelResource) TokenizerME(opennlp.tools.tokenize.TokenizerME) TokenizerModel(opennlp.tools.tokenize.TokenizerModel) ResourceAccessException(org.apache.uima.resource.ResourceAccessException)

Example 4 with TokenizerME

use of opennlp.tools.tokenize.TokenizerME in project textdb by TextDB.

the class NameFinderExample method Tokenize.

/**
 * Tokenizes a sentence with an OpenNLP {@link TokenizerME}.
 *
 * The tokenizer model is read from the hard-coded relative path
 * {@code ./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-token.bin}
 * on every call.
 *
 * @param sentence the sentence to split into tokens
 * @return the tokens returned by the tokenizer
 * @throws InvalidFormatException declared for failures while loading the model
 * @throws IOException if the model file cannot be opened or read
 */
public static String[] Tokenize(String sentence) throws InvalidFormatException, IOException {
    // try-with-resources closes the model stream even when loading the model or
    // tokenizing throws; the previous explicit close() was skipped on failure.
    try (InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        Tokenizer tokenizer = new TokenizerME(model);
        return tokenizer.tokenize(sentence);
    }
}
Also used : FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) TokenizerME(opennlp.tools.tokenize.TokenizerME) TokenizerModel(opennlp.tools.tokenize.TokenizerModel) Tokenizer(opennlp.tools.tokenize.Tokenizer)

Example 5 with TokenizerME

use of opennlp.tools.tokenize.TokenizerME in project stanbol by apache.

the class OpenNLP method getTokenizer.

/**
 * Returns a {@link Tokenizer} for the given language.
 * <p>
 * When a language is given and a {@link TokenizerModel} can be obtained for
 * it, a new {@link TokenizerME} backed by that model is returned. In every
 * other case (<code>null</code> language, no model available, or the model
 * failing to load) the shared {@link SimpleTokenizer} instance is returned;
 * load failures are logged as warnings and not propagated.
 *
 * @param language the language, or <code>null</code> to get the
 * {@link SimpleTokenizer}
 * @return the {@link Tokenizer} to use for the given language; never
 * <code>null</code>
 */
public Tokenizer getTokenizer(String language) {
    if (language != null) {
        try {
            TokenizerModel languageModel = getTokenizerModel(language);
            if (languageModel != null) {
                Tokenizer meTokenizer = new TokenizerME(languageModel);
                log.debug("Use ME Tokenizer for language {}", language);
                return meTokenizer;
            }
        } catch (InvalidFormatException e) {
            log.warn("Unable to load Tokenizer Model for " + language + ": " + "Will use Simple Tokenizer instead", e);
        } catch (IOException e) {
            log.warn("Unable to load Tokenizer Model for " + language + ": " + "Will use Simple Tokenizer instead", e);
        }
    }
    // no language, no model, or the model failed to load: fall back to the simple tokenizer
    log.debug("Use Simple Tokenizer for language {}", language);
    return SimpleTokenizer.INSTANCE;
}
Also used : TokenizerME(opennlp.tools.tokenize.TokenizerME) IOException(java.io.IOException) Tokenizer(opennlp.tools.tokenize.Tokenizer) SimpleTokenizer(opennlp.tools.tokenize.SimpleTokenizer) TokenizerModel(opennlp.tools.tokenize.TokenizerModel) InvalidFormatException(opennlp.tools.util.InvalidFormatException)

Aggregations

TokenizerME (opennlp.tools.tokenize.TokenizerME)5 TokenizerModel (opennlp.tools.tokenize.TokenizerModel)4 Tokenizer (opennlp.tools.tokenize.Tokenizer)3 FileInputStream (java.io.FileInputStream)2 InputStream (java.io.InputStream)2 IOException (java.io.IOException)1 SimpleTokenizer (opennlp.tools.tokenize.SimpleTokenizer)1 InvalidFormatException (opennlp.tools.util.InvalidFormatException)1 TokenizerModelResource (opennlp.uima.tokenize.TokenizerModelResource)1 ResourceAccessException (org.apache.uima.resource.ResourceAccessException)1 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)1