Search in sources :

Example 6 with StringSource

use of org.apache.stanbol.enhancer.servicesapi.impl.StringSource in project stanbol by apache.

the class AnalyzedTextSerializerAndParserTest method setup.

@BeforeClass
public static final void setup() throws IOException {
    ci = ciFactory.createContentItem(new StringSource(text));
    textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    analysedTextWithData = createAnalysedText();
    int sentence = text.indexOf('.') + 1;
    Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
    expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " + "cities such as Paris and people such as Bob Marley.");
    Token the = sent1.addToken(0, 3);
    expectedTokens.put(the, "The");
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PREP", Pos.Preposition), 0.85));
    Token stanbol = sent1.addToken(4, 11);
    expectedTokens.put(stanbol, "Stanbol");
    stanbol.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    stanbol.addAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION, Value.value(0.5));
    //use index to create Tokens
    int enhancerStart = sent1.getSpan().indexOf("enhancer");
    Token enhancer = sent1.addToken(enhancerStart, enhancerStart + "enhancer".length());
    expectedTokens.put(enhancer, "enhancer");
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("PN", Pos.ProperNoun), 0.95));
    enhancer.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("N", LexicalCategory.Noun), 0.87));
    MorphoFeatures morpho = new MorphoFeatures("enhance");
    morpho.addCase(new CaseTag("test-case-1", Case.Comitative));
    morpho.addCase(new CaseTag("test-case-2", Case.Abessive));
    morpho.addDefinitness(Definitness.Definite);
    morpho.addPerson(Person.First);
    morpho.addPos(new PosTag("PN", Pos.ProperNoun));
    morpho.addGender(new GenderTag("test-gender", Gender.Masculine));
    morpho.addNumber(new NumberTag("test-number", NumberFeature.Plural));
    morpho.addTense(new TenseTag("test-tense", Tense.Present));
    morpho.addVerbForm(new VerbMoodTag("test-verb-mood", VerbMood.ConditionalVerb));
    enhancer.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, Value.value(morpho));
    //create a chunk
    Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
    expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
    stanbolEnhancer.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("organization", DBPEDIA_ORGANISATION)));
    stanbolEnhancer.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("NP", LexicalCategory.Noun), 0.98));
}
Also used : CaseTag(org.apache.stanbol.enhancer.nlp.morpho.CaseTag) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) Token(org.apache.stanbol.enhancer.nlp.model.Token) VerbMoodTag(org.apache.stanbol.enhancer.nlp.morpho.VerbMoodTag) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) NumberTag(org.apache.stanbol.enhancer.nlp.morpho.NumberTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TenseTag(org.apache.stanbol.enhancer.nlp.morpho.TenseTag) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) GenderTag(org.apache.stanbol.enhancer.nlp.morpho.GenderTag) BeforeClass(org.junit.BeforeClass)

Example 7 with StringSource

use of org.apache.stanbol.enhancer.servicesapi.impl.StringSource in project stanbol by apache.

the class ContentSourceTest method checkMediaTypeForStringSource.

@Test
public void checkMediaTypeForStringSource() throws IOException {
    ContentSource source = new StringSource(TEST_STRING);
    assertEquals(STRING_DEFAULT_MT, source.getMediaType());
    source = new StringSource(TEST_STRING, null);
    assertEquals(STRING_DEFAULT_MT, source.getMediaType());
    source = new StringSource(TEST_STRING, UTF8, null);
    assertEquals(STRING_DEFAULT_MT, source.getMediaType());
    source = new StringSource(TEST_STRING, null, null);
    assertEquals(STRING_DEFAULT_MT, source.getMediaType());
    //this can be used to force the system default
    source = new StringSource(TEST_STRING, Charset.defaultCharset(), null);
    Map<String, String> mt = ContentItemHelper.parseMimeType(source.getMediaType());
    assertEquals("text/plain", mt.get(null));
    assertEquals(Charset.defaultCharset().name(), mt.get("charset"));
    String OTHER_MT = "text/rtf";
    source = new StringSource(TEST_STRING, OTHER_MT);
    mt = ContentItemHelper.parseMimeType(source.getMediaType());
    assertEquals(OTHER_MT, mt.get(null));
    assertEquals(UTF8.name(), mt.get("charset"));
    source = new StringSource(TEST_STRING, null, OTHER_MT);
    mt = ContentItemHelper.parseMimeType(source.getMediaType());
    assertEquals(OTHER_MT, mt.get(null));
    assertEquals(UTF8.name(), mt.get("charset"));
    Charset ISO8859_4 = Charset.forName("ISO-8859-4");
    source = new StringSource(TEST_STRING, ISO8859_4, OTHER_MT);
    mt = ContentItemHelper.parseMimeType(source.getMediaType());
    assertEquals(OTHER_MT, mt.get(null));
    assertEquals(ISO8859_4.name(), mt.get("charset"));
}
Also used : ContentSource(org.apache.stanbol.enhancer.servicesapi.ContentSource) Charset(java.nio.charset.Charset) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) Test(org.junit.Test)

Example 8 with StringSource

use of org.apache.stanbol.enhancer.servicesapi.impl.StringSource in project stanbol by apache.

the class EntityLinkingEngineTest method setUpServices.

@BeforeClass
public static void setUpServices() throws IOException {
    searcher = new TestSearcherImpl(TEST_REFERENCED_SITE_NAME, NAME, new SimpleLabelTokenizer());
    //add some terms to the searcher
    Graph graph = new IndexedGraph();
    IRI uri = new IRI("urn:test:PatrickMarshall");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Patrick Marshall")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PERSON));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:Geologist");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Geologist")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    graph.add(new TripleImpl(uri, REDIRECT, new IRI("urn:test:redirect:Geologist")));
    searcher.addEntity(new Entity(uri, graph));
    //a redirect
    uri = new IRI("urn:test:redirect:Geologist");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Geologe (redirect)")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:NewZealand");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("New Zealand")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:UniversityOfOtago");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University of Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:University");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:Otago");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    //add a 2nd Otago (Place and University
    uri = new IRI("urn:test:Otago_Texas");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago (Texas)")));
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:UniversityOfOtago_Texas");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University of Otago (Texas)")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(uri, graph));
    TEST_ANALYSED_TEXT = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT)));
    TEST_ANALYSED_TEXT_WO = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT_WO)));
    initAnalyzedText(TEST_ANALYSED_TEXT);
    TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick Marshall".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT.addToken(4, 11).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT.addToken(12, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    initAnalyzedText(TEST_ANALYSED_TEXT_WO);
    TEST_ANALYSED_TEXT_WO.addChunk(0, "Dr. Marshall Patrick".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT_WO.addToken(4, 12).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT_WO.addToken(13, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) SimpleLabelTokenizer(org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.SimpleLabelTokenizer) TestSearcherImpl(org.apache.stanbol.enhancer.engines.entitylinking.impl.TestSearcherImpl) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) BeforeClass(org.junit.BeforeClass)

Example 9 with StringSource

use of org.apache.stanbol.enhancer.servicesapi.impl.StringSource in project stanbol by apache.

the class TestEntityLinkingEnhancementEngine method initContentItem.

/**
     * Creates and initialises a new content item using {@link #CONTEXT} as
     * content and 
     * @return
     * @throws IOException
     */
private ContentItem initContentItem() throws IOException {
    ContentItem ci = ciFactory.createContentItem(new IRI("urn:iks-project:enhancer:text:content-item:person"), new StringSource(CONTEXT));
    //add three text annotations to be consumed by this test
    getTextAnnotation(ci, PERSON, CONTEXT, DBPEDIA_PERSON);
    getTextAnnotation(ci, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
    getTextAnnotation(ci, PLACE, CONTEXT, DBPEDIA_PLACE);
    //add the language
    ci.getMetadata().add(new TripleImpl(ci.getUri(), Properties.DC_LANGUAGE, new PlainLiteralImpl("en")));
    return ci;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem)

Example 10 with StringSource

use of org.apache.stanbol.enhancer.servicesapi.impl.StringSource in project stanbol by apache.

the class EntityCoReferenceEngineTest method testSpatialCoref.

@Test
public void testSpatialCoref() throws EngineException, IOException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int chancellorStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(chancellorStartIdx, chancellorStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanChancellor = sentence2.addChunk(theStartIdx, chancellorStartIdx + "politician".length());
    theGermanChancellor.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanChancellor));
    Value<CorefFeature> subordinateCorefValue = theGermanChancellor.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertTrue(!subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Aggregations

StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource)26 Test (org.junit.Test)14 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)13 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)13 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)12 IRI (org.apache.clerezza.commons.rdf.IRI)10 Before (org.junit.Before)5 HashMap (java.util.HashMap)4 Graph (org.apache.clerezza.commons.rdf.Graph)4 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)4 Token (org.apache.stanbol.enhancer.nlp.model.Token)4 PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag)4 Charset (java.nio.charset.Charset)3 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)3 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)3 ContentSource (org.apache.stanbol.enhancer.servicesapi.ContentSource)3 BeforeClass (org.junit.BeforeClass)3 Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk)2 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)2 Value (org.apache.stanbol.enhancer.nlp.model.annotation.Value)2