Search in sources :

Example 1 with TokenNameFinderModel

use of opennlp.tools.namefind.TokenNameFinderModel in project elasticsearch-opennlp-plugin by spinscale.

the class OpenNlpService method tokenize.

/**
 * Runs every configured name finder over the tokenized content and collects
 * the results into a map.
 *
 * <p>The content is split with {@code SimpleTokenizer.INSTANCE}. For each entry
 * of the {@code finders} map a {@code NameFinderME} is created from the entry's
 * model, and every span it finds is recorded, together with its probability,
 * as a {@code TextAnnotation} tagged with the entry's key. The collected
 * annotations are then handed to {@code convertTextAnnotationsToNamedEntities},
 * which receives the result map as its last argument.
 *
 * <p>NOTE(review): this listing has lost lines. The closing braces are absent
 * and the {@code if} near the end has no visible body, so the snippet does not
 * compile as shown; consult the upstream project for the complete method.
 *
 * @param content the text to analyse
 * @return the map passed to {@code convertTextAnnotationsToNamedEntities};
 *         presumably keyed by entity type with the matched entity strings as
 *         values (confirm against that helper)
 */
public Map<String, Set<String>> tokenize(String content) {
    Map<String, Set<String>> namedEntities = Maps.newHashMap();
    List<TextAnnotation> allTextAnnotations = new ArrayList<TextAnnotation>();
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
    // One pass per configured model; the map key becomes the annotation type.
    for (Map.Entry<String, TokenNameFinderModel> finderEntry : finders.entrySet()) {
        String type = finderEntry.getKey();
        // A new NameFinderME is built from the model on every call of this method.
        NameFinderME finder = new NameFinderME(finderEntry.getValue());
        Span[] spans = finder.find(tokens);
        // probs[ni] is stored alongside spans[ni] in the loop below.
        double[] probs = finder.probs(spans);
        for (int ni = 0; ni < spans.length; ni++) {
            allTextAnnotations.add(new TextAnnotation(type, spans[ni], probs[ni]));
    // NOTE(review): the statement guarded by this check is missing from the listing.
    if (allTextAnnotations.size() > 0) {
    convertTextAnnotationsToNamedEntities(tokens, allTextAnnotations, namedEntities);
    return namedEntities;
Also used : TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) PooledTokenNameFinderModel(org.elasticsearch.service.opennlp.models.PooledTokenNameFinderModel) Span(opennlp.tools.util.Span) NameFinderME(opennlp.tools.namefind.NameFinderME) TextAnnotation(org.elasticsearch.service.opennlp.models.TextAnnotation)

Example 2 with TokenNameFinderModel

use of opennlp.tools.namefind.TokenNameFinderModel in project stanbol by apache.

the class NEREngineCore method computeEnhancements.

/**
 * Runs named-entity recognition on a content item using the default and the
 * additionally configured name finder models for the item's language.
 *
 * <p>Visible flow: the language is extracted and checked with
 * {@code isNerModel}; then either an {@code AnalysedText} that already has
 * tokens is used (in which case {@code text} is set to {@code null}), or the
 * plain text is read from a content part matching {@code SUPPORTED_MIMETYPES}.
 * Afterwards {@code findNamedEntities} is called once per available default
 * model type and once per model returned by
 * {@code config.getSpecificNerModles(language)}.
 *
 * <p>NOTE(review): this listing has lost lines. The closing braces are absent
 * and at least one statement is truncated (see the notes inline), so the
 * snippet does not compile as shown.
 *
 * @param ci the content item to process
 * @throws EngineException wraps any checked exception raised while running the
 *         models; an {@code InvalidContentException} is thrown when reading
 *         the text fails with an {@code IOException}
 * @throws IllegalStateException if no language can be extracted, no NER model
 *         is configured for the language, or no suitable content part exists
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    //first check the language before processing the content (text)
    String language = extractLanguage(ci);
    if (language == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    if (!isNerModel(language)) {
        throw new IllegalStateException("For the language '" + language + "' of ContentItem " + ci.getUri() + " no NER model is configured: This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    //validate data in the AnalysedText
    final String text;
    if (at != null && at.getTokens().hasNext()) {
        //if the AnalysedText is present and tokens are present
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}", ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
        // the plain text is left unset on this path; findNamedEntities receives both at and text
        text = null;
    } else {
        //no AnalysedText with tokens ...
        //fallback to processing the plain text is still supported
        Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
        if (contentPart == null) {
            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
        try {
            text = ContentItemHelper.getText(contentPart.getValue());
        } catch (IOException e) {
            throw new InvalidContentException(this, ci, e);
        if (text.trim().length() == 0) {
            // TODO: make the length of the data a field of the ContentItem
            // interface to be able to filter out empty items in the canEnhance
            // method
            // NOTE(review): the format string has three {} placeholders but only two
            // arguments follow, and the concatenation yields "textto" (missing space).
            log.warn("ContentPart {} of ContentItem {} does not contain any text" + "to extract knowledge from in ContentItem {}", contentPart.getKey(), ci);
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}", new Object[] { contentPart.getKey(), ci.getUri().getUnicodeString(), StringUtils.abbreviate(text, 100) });
    try {
        // default model types are only run when the language is marked as processed
        if (config.isProcessedLangage(language)) {
            for (String defaultModelType : config.getDefaultModelTypes()) {
                TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
                if (nameFinderModel == null) {
                    // NOTE(review): the call this argument list belongs to is missing from
                    // the listing (presumably a logger call; confirm against upstream).
          "No NER Model for {} and language {} available!", defaultModelType, language);
                } else {
                    findNamedEntities(ci, at, text, language, nameFinderModel);
        //process for additional models
        for (String additionalModel : config.getSpecificNerModles(language)) {
            TokenNameFinderModel nameFinderModel;
            // a failure to load or run one additional model is logged and does not stop the others
            try {
                nameFinderModel = openNLP.getModel(TokenNameFinderModel.class, additionalModel, null);
                findNamedEntities(ci, at, text, language, nameFinderModel);
            } catch (IOException e) {
                log.warn("Unable to load TokenNameFinderModel model for language '" + language + "' (model: " + additionalModel + ")", e);
            } catch (RuntimeException e) {
                // NOTE(review): the message names ChunkerModel although a
                // TokenNameFinderModel is being loaded here.
                log.warn("Error while creating ChunkerModel for language '" + language + "' (model: " + additionalModel + ")", e);
    } catch (Exception e) {
        // runtime exceptions propagate unchanged; everything else is wrapped
        if (e instanceof RuntimeException) {
            throw (RuntimeException) e;
        } else {
            throw new EngineException(this, ci, e);
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) IRI(org.apache.clerezza.commons.rdf.IRI) TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) InvalidFormatException(opennlp.tools.util.InvalidFormatException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) IOException(java.io.IOException)

Example 3 with TokenNameFinderModel

use of opennlp.tools.namefind.TokenNameFinderModel in project stanbol by apache.

the class OpenNLPTest method testLoadModelByName.

/**
 * Requests one model of each type by file name through
 * {@code openNLP.getModel}: tokenizer, sentence detector, POS tagger, chunker
 * and person name finder for English, followed by a Russian tokenizer model
 * that the inline comment marks as unavailable.
 *
 * <p>NOTE(review): no assertions and no closing brace are visible in this
 * listing; lines appear to have been lost, so the expected outcome of each
 * lookup cannot be read from here.
 *
 * @throws IOException declared by the test; presumably raised by model loading
 *         (confirm against {@code getModel})
 */
public void testLoadModelByName() throws IOException {
    TokenizerModel tokenModel = openNLP.getModel(TokenizerModel.class, "en-token.bin", null);
    SentenceModel sentModel = openNLP.getModel(SentenceModel.class, "en-sent.bin", null);
    POSModel posModel = openNLP.getModel(POSModel.class, "en-pos-maxent.bin", null);
    ChunkerModel chunkModel = openNLP.getModel(ChunkerModel.class, "en-chunker.bin", null);
    TokenNameFinderModel nerModel = openNLP.getModel(TokenNameFinderModel.class, "en-ner-person.bin", null);
    //unavailable model
    tokenModel = openNLP.getModel(TokenizerModel.class, "ru-token.bin", null);
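Also used : TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) ChunkerModel(opennlp.tools.chunker.ChunkerModel) SentenceModel(opennlp.tools.sentdetect.SentenceModel) POSModel(opennlp.tools.postag.POSModel) TokenizerModel(opennlp.tools.tokenize.TokenizerModel) Test(org.junit.Test)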

Example 4 with TokenNameFinderModel

use of opennlp.tools.namefind.TokenNameFinderModel in project stanbol by apache.

the class OpenNLPTest method testLoadMissingNER.

/**
 * Requests a name finder model and a name finder for two combinations the
 * inline comments describe as unknown: the type {@code "person2"} for English,
 * and the type {@code "person"} for the language {@code "ru"}.
 *
 * <p>NOTE(review): no assertions and no closing brace are visible in this
 * listing; lines appear to have been lost, so the expected result of each
 * lookup cannot be read from here.
 *
 * @throws IOException declared by the test; presumably raised by the lookups
 *         (confirm against {@code getNameModel} and {@code getNameFinder})
 */
public void testLoadMissingNER() throws IOException {
    //first unknown type
    TokenNameFinderModel model = openNLP.getNameModel("person2", "en");
    TokenNameFinder ner = openNLP.getNameFinder("person2", "en");
    //unknown language
    model = openNLP.getNameModel("person", "ru");
    ner = openNLP.getNameFinder("person", "ru");
Also used : TokenNameFinder(opennlp.tools.namefind.TokenNameFinder) TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) Test(org.junit.Test)

Example 5 with TokenNameFinderModel

use of opennlp.tools.namefind.TokenNameFinderModel in project textdb by TextDB.

the class NameFinderExample method main.

/**
 * Reads a text file line by line, tokenizes each line with {@code Tokenize}
 * and runs a {@code NameFinderME} built from the model file
 * {@code en-ner-location.bin} over the tokens. When spans are found, the
 * tokens of the line are printed in square brackets; at the end the value of
 * {@code counter} is printed as the number of results.
 *
 * <p>NOTE(review): this listing has lost lines. The closing braces and the
 * bodies of the innermost loop and of the last {@code if} are missing, so the
 * snippet does not compile as shown. In the lines that remain,
 * {@code counter} is never incremented, {@code perfMon} is never used after
 * construction, and neither {@code scan} nor {@code is} is closed.
 *
 * @param args command-line arguments; not read in the visible lines
 * @throws IOException declared by the method; presumably raised when opening
 *         the input files or loading the model
 */
public static void main(String[] args) throws IOException {
    String dataFile = "./src/main/resources/abstract_100.txt";
    Scanner scan = new Scanner(new File(dataFile));
    // both paths are relative, so they resolve against the working directory
    InputStream is = new FileInputStream("./src/main/java/edu/uci/ics/textdb/sandbox/OpenNLPexample/en-ner-location.bin");
    TokenNameFinderModel model = new TokenNameFinderModel(is);
    NameFinderME nameFinder = new NameFinderME(model);
    int counter = 0;
    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "sent");
    // each input line is treated as one sentence
    while (scan.hasNextLine()) {
        String[] sentence = Tokenize(scan.nextLine());
        Span[] spans = nameFinder.find(sentence);
        //Print out the tokens of the sentence
        if (spans.length != 0) {
            for (String s : sentence) {
                System.out.print("[" + s + "] ");
        //Print out the offset of each span
        for (Span s : spans) {
            // NOTE(review): the body of this loop is missing from the listing.
            for (int i = s.getStart(); i < s.getEnd(); i++) {
        // NOTE(review): the statement guarded by this check is missing from the listing.
        if (spans.length != 0)
    System.out.println("Number of Results: " + counter);
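Also used : Scanner(java.util.Scanner) TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) NameFinderME(opennlp.tools.namefind.NameFinderME) PerformanceMonitor(opennlp.tools.cmdline.PerformanceMonitor) File(java.io.File) Span(opennlp.tools.util.Span) FileInputStream(java.io.FileInputStream)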


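TokenNameFinderModel (opennlp.tools.namefind.TokenNameFinderModel) NameFinderME (opennlp.tools.namefind.NameFinderME) Test (org.junit.Test)3 IOException (java.io.IOException) TokenNameFinder (opennlp.tools.namefind.TokenNameFinder) Span (opennlp.tools.util.Span) File (java.io.File) FileInputStream (java.io.FileInputStream) InputStream (java.io.InputStream) URISyntaxException (java.net.URISyntaxException) Scanner (java.util.Scanner)1 ChunkerModel (opennlp.tools.chunker.ChunkerModel) PerformanceMonitor (opennlp.tools.cmdline.PerformanceMonitor) POSModel (opennlp.tools.postag.POSModel) SentenceModel (opennlp.tools.sentdetect.SentenceModel) TokenizerModel (opennlp.tools.tokenize.TokenizerModel) InvalidFormatException (opennlp.tools.util.InvalidFormatException) IRI (org.apache.clerezza.commons.rdf.IRI)1 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)1 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)1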