Example 26 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class DBPSpotlightAnnotateEnhancementEngine method computeEnhancements.

/**
 * Calculate the enhancements by doing a POST request to the DBpedia
 * Spotlight endpoint and processing the results
 *
 * @param ci
 *            the {@link ContentItem}
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Language language = SpotlightEngineUtils.getContentLanguage(ci);
    String text = SpotlightEngineUtils.getPlainContent(ci);
    Collection<Annotation> dbpslGraph = doPostRequest(text, ci.getUri());
    Map<SurfaceForm, IRI> surfaceForm2TextAnnotation = new HashMap<SurfaceForm, IRI>();
    if (dbpslGraph != null) {
        // Acquire a write lock on the ContentItem when adding the
        // enhancements
        ci.getLock().writeLock().lock();
        try {
            createEnhancements(dbpslGraph, ci, text, language, surfaceForm2TextAnnotation);
            if (log.isDebugEnabled()) {
                Serializer serializer = Serializer.getInstance();
                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
                try {
                    log.debug("DBPedia Spotlight Enhancements:\n{}", debugStream.toString("UTF-8"));
                } catch (UnsupportedEncodingException e) {
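                    // cannot happen: UTF-8 support is mandated by the JVM spec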
                    e.printStackTrace();
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Language(org.apache.clerezza.commons.rdf.Language) SurfaceForm(org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm) HashMap(java.util.HashMap) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Annotation(org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation) Serializer(org.apache.clerezza.rdf.core.serializedform.Serializer)
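
The write lock shown above pairs with a read lock for consumers of the enhancements. The following is a minimal sketch (assuming an existing ContentItem ci), not taken from the Stanbol sources:

// reading enhancement metadata uses the matching read lock
ci.getLock().readLock().lock();
try {
    Graph metadata = ci.getMetadata();
    // ... query or iterate over the metadata here ...
} finally {
    ci.getLock().readLock().unlock();
}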

Example 27 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class ClerezzaRDFParser method handleStatement.

private void handleStatement(RDFDataset result, Triple t, Map<BlankNode, String> bNodeMap) {
    final String subject = getResourceValue(t.getSubject(), bNodeMap);
    final String predicate = getResourceValue(t.getPredicate(), bNodeMap);
    final RDFTerm object = t.getObject();
    if (object instanceof Literal) {
        final String value = ((Literal) object).getLexicalForm();
        final String language;
        final String datatype;
        datatype = getResourceValue(((Literal) object).getDataType(), bNodeMap);
        Language l = ((Literal) object).getLanguage();
        if (l == null) {
            language = null;
        } else {
            language = l.toString();
        }
        result.addTriple(subject, predicate, value, datatype, language);
        count++;
    } else {
        result.addTriple(subject, predicate, getResourceValue((BlankNodeOrIRI) object, bNodeMap));
        count++;
    }
}
Also used : Language(org.apache.clerezza.commons.rdf.Language) Literal(org.apache.clerezza.commons.rdf.Literal) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm)
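
Note that getLanguage() returns null for plain and typed literals, which is why the null guard above is required; a minimal sketch (using the PlainLiteralImpl constructor seen in the other examples on this page):

Literal tagged = new PlainLiteralImpl("Bonjour", new Language("fr"));
Language l = tagged.getLanguage();
// Language.toString() yields the bare tag, e.g. "fr"; keep it null-safe for untagged literals
String tag = (l == null) ? null : l.toString();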

Example 28 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class IndexedGraph method compare.

/**
 * Compares Resources to correctly sort them within the index.<p>
 * Sort criteria are:<ol>
 * <li> URIs are sorted by their {@link IRI#getUnicodeString()} unicode
 * string
 * <li> Literals
 * <ol>
 * <li> sort by the {@link Literal#getLexicalForm() lexical form}
 * <li> sort by {@link Literal#getLanguage() language}
 * (<code>null</code> value first)
 * <li> sort by {@link Literal#getDataType() type} (<code>null</code>
 * value first)
 * </ol>
 * <li> BlankNodes
 * <ol>
 * <li> sorted by their
 * {@link System#identityHashCode(Object) identity hash code}
 * <li> on hash code conflicts (same hash code but not equals) a random
 * order is chosen and kept in the passed conflictsMap
 * </ol>
 * </ol>
 * <b>NOTEs</b><ul>
 * <li> passed {@link RDFTerm}s are not required to correctly implement
 * {@link Object#hashCode() hashCode} and
 * {@link Object#equals(Object) equals}
 * <li> passed {@link IRI}, {@link BlankNode} and {@link Literal}
 * instances MUST NOT extend/implement any of the other
 * classes/interfaces. This means that an {@link IRI} MUST NOT implement
 * {@link BlankNode} nor {@link Literal}
 * <li> passed {@link Literal}s MAY implement PlainLiteral AND
 * TypedLiteral. This allows wrappers over frameworks that do not
 * distinguish between those two literal types to be used with the
 * {@link IndexedGraph}.
 * </ul>
 *
 * @param a the first resource to compare
 * @param b the second resource to compare
 * @param conflictsMap the map used to resolve BlankNodes with hash code
 * conflicts
 * @return a negative integer, zero, or a positive integer as the first
 * resource sorts before, equal to, or after the second
 */
protected static int compare(RDFTerm a, RDFTerm b, Map<Integer, List<RDFTerm>> conflictsMap) {
    // Handle special cases for MAX and MIN values
    if (a == MIN || b == MAX) {
        return -1;
    } else if (a == MAX || b == MIN) {
        return 1;
    }
    // sort (0) IRIs < (1) Literals (PlainLiterals & TypedLiterals) < (2) BlankNodes
    int at = a instanceof IRI ? 0 : a instanceof Literal ? 1 : 2;
    int bt = b instanceof IRI ? 0 : b instanceof Literal ? 1 : 2;
    if (at == bt) {
        // same type: sort within that type
        if (at < 2) {
            // no BlankNode
            // sort in alphabetic order of the string representation
            String as = at == 0 ? ((IRI) a).getUnicodeString() : ((Literal) a).getLexicalForm();
            String bs = bt == 0 ? ((IRI) b).getUnicodeString() : ((Literal) b).getLexicalForm();
            int sc = as.compareTo(bs);
            if (sc == 0 && at == 1) {
                // same string value and Literals
                // check if the language and types are the same
                Language al = a instanceof Literal ? ((Literal) a).getLanguage() : null;
                Language bl = b instanceof Literal ? ((Literal) b).getLanguage() : null;
                // first try to sort by language
                if (al == null) {
                    sc = bl == null ? 0 : -1;
                } else if (bl == null) {
                    sc = 1;
                } else {
                    sc = al.toString().compareTo(bl.toString());
                }
                if (sc == 0) {
                    // if still equals look at the dataType
                    IRI adt = a instanceof Literal ? ((Literal) a).getDataType() : null;
                    IRI bdt = b instanceof Literal ? ((Literal) b).getDataType() : null;
                    if (adt == null) {
                        sc = bdt == null ? 0 : -1;
                    } else if (bdt == null) {
                        sc = 1;
                    } else {
                        sc = adt.getUnicodeString().compareTo(bdt.getUnicodeString());
                    }
                }
                return sc;
            } else {
                // IRIs, or Literals with different lexical forms: return the string compare
                return sc;
            }
        } else {
            // handle BlankNodes
            // sort BlankNodes based on hashCode
            int ah = a.hashCode();
            int bh = b.hashCode();
            if (ah == bh) {
                if (!a.equals(b)) {
                    // if the implementation's hash is the same, but the instances
                    // are not equal, try to sort them by identity hash code
                    int ash = System.identityHashCode(a);
                    int bsh = System.identityHashCode(b);
                    if (ash == bsh) {
                        // resolve the decision via the conflictsMap
                        return resolveBlankNodeHashConflict(a, b, conflictsMap);
                    } else {
                        return ash < bsh ? -1 : 1;
                    }
                } else {
                    // same hash and equals
                    return 0;
                }
            } else {
                // sort by hash
                return ah < bh ? -1 : 1;
            }
        }
    } else {
        return at < bt ? -1 : 1;
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Language(org.apache.clerezza.commons.rdf.Language) Literal(org.apache.clerezza.commons.rdf.Literal)
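
The null-first language ordering described in the javadoc can also be expressed with java.util.Comparator; a standalone sketch (not part of IndexedGraph):

// Literals without a language tag sort before tagged ones, matching sort criterion 2 above
Comparator<Literal> byLanguage = Comparator.comparing(
        (Literal lit) -> lit.getLanguage() == null ? null : lit.getLanguage().toString(),
        Comparator.nullsFirst(Comparator.naturalOrder()));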

Example 29 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class ClerezzaTripleCallback method triple.

private void triple(String s, String p, String value, String datatype, String language, String graph) {
    final BlankNodeOrIRI subject = getBlankNodeOrIRI(s);
    final IRI predicate = new IRI(p);
    RDFTerm object;
    if (language != null) {
        object = new PlainLiteralImpl(value, new Language(language));
    } else if (datatype == null || RDF_LANG_STRING.equals(datatype)) {
        object = new PlainLiteralImpl(value);
    } else {
        object = new TypedLiteralImpl(value, new IRI(datatype));
    }
    mGraph.add(new TripleImpl(subject, predicate, object));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TypedLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.TypedLiteralImpl) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)
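
The three branches above map onto the three Clerezza literal kinds; a minimal sketch using the same constructors:

// language-tagged literal: "hello"@en (first branch)
RDFTerm tagged = new PlainLiteralImpl("hello", new Language("en"));
// plain literal, also chosen for rdf:langString values without a tag (second branch)
RDFTerm plain = new PlainLiteralImpl("hello");
// typed literal: "42"^^xsd:int (third branch)
RDFTerm typed = new TypedLiteralImpl("42", new IRI("http://www.w3.org/2001/XMLSchema#int"));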

Example 30 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class KuromojiNlpEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates a bug in the used EnhancementJobManager implementation. " + "Please report this on dev@stanbol.apache.org or create a " + "JIRA issue about this.");
    }
    // start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    // build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    // Try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    // NER data
    List<NerData> nerList = new ArrayList<NerData>();
    // the next index where the NerData.context need to be set
    int nerSentIndex = 0;
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        // required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            // Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            // Sentence detection by POS tag
            if (sentStartOffset < 0) {
                // the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                // add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            // POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            // NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                // write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                // NOTE that the fise:TextAnnotation are written later based on the nerList
                // clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                // and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
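            // NOTE: the inflection form and type are read but not used in this snippet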
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
            if (morpho != null) {
                // if present add the morpho
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        // we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
    // and set the context of the remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                // no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
        /* ignore */
        }
    }
    // finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IRI(org.apache.clerezza.commons.rdf.IRI) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) BaseFormAttribute(org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) InflectionAttribute(org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) PartOfSpeechAttribute(org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute) IOException(java.io.IOException) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Graph(org.apache.clerezza.commons.rdf.Graph) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
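
The language guard at the top of computeEnhancements generalizes to any primary language tag; a small helper sketch (hypothetical, not part of the Stanbol sources):

// hypothetical helper: true for an exact match ("ja") or a sub-tag ("ja-JP")
static boolean hasLanguage(String detected, String expected) {
    return expected.equals(detected)
            || (detected != null && detected.startsWith(expected + "-"));
}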

Aggregations

Language (org.apache.clerezza.commons.rdf.Language) 32
IRI (org.apache.clerezza.commons.rdf.IRI) 24
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) 20
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) 19
Graph (org.apache.clerezza.commons.rdf.Graph) 17
Literal (org.apache.clerezza.commons.rdf.Literal) 12
ArrayList (java.util.ArrayList) 8
BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI) 8
LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory) 8
IOException (java.io.IOException) 7
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException) 7
HashSet (java.util.HashSet) 5
RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm) 5
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText) 5
NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) 5
ByteArrayOutputStream (java.io.ByteArrayOutputStream) 4
UnsupportedEncodingException (java.io.UnsupportedEncodingException) 4
HashMap (java.util.HashMap) 4
SOAPException (javax.xml.soap.SOAPException) 4
Triple (org.apache.clerezza.commons.rdf.Triple) 4