Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the postProcessEntities method of the IndexerImpl class.
/**
 * Runs the configured {@code entityPostProcessors} over all Entities that were
 * already written to the indexing destination. Transitions the Indexer state
 * from INDEXED to POSTPROCESSING and finally to POSTPROCESSED. Throws an
 * {@link IllegalStateException} if the Indexer has not yet reached the INDEXED
 * state, and is a no-op if post-processing was already started/completed by
 * another thread. Thread-safe with respect to concurrent invocations.
 */
@Override
public void postProcessEntities() {
synchronized (stateSync) {
//ensure that two threads do not start the
//initialisation at the same time ...
State state = getState();
if (state.ordinal() < State.INDEXED.ordinal()) {
throw new IllegalStateException("The Indexer MUST BE already " + State.INDEXED + " when calling this Method!");
}
if (state != State.INDEXED) {
// already POSTPROCESSING/POSTPROCESSED (some other thread got here first)
// ignore this call
return;
}
// claim the POSTPROCESSING phase while still holding the lock
setState(State.POSTPROCESSING);
log.info("{}: PostProcessing started ...", name);
}
// no processors configured -> phase completes immediately
if (entityPostProcessors == null || entityPostProcessors.isEmpty()) {
setState(State.POSTPROCESSED);
//nothing to do
return;
}
//init the post processing components
//use an EntityDataProvider based on the indexed data
EntityDataProvider dataProvider = new YardEntityDataProvider(indexingDestination.getYard());
//use an LineBasedEntityIterator to iterate over the indexed entity ids
EntityIterator entityIterator;
try {
entityIterator = new LineBasedEntityIterator(getEntityIdFileInputStream(), "UTF-8", null);
} catch (IOException e) {
throw new IllegalStateException("Unable to open file containing the " + "IDs of the indexed Entities!", e);
}
// configure the iterator: the id is in column 1; Integer.MAX_VALUE for the
// score position means "no score column" (scores are not re-parsed here)
Map<String, Object> config = new HashMap<String, Object>();
config.put(LineBasedEntityIterator.PARAM_ID_POS, 1);
config.put(LineBasedEntityIterator.PARAM_SCORE_POS, Integer.MAX_VALUE);
entityIterator.setConfiguration(config);
//does not really make sense for processors
for (EntityProcessor processor : entityPostProcessors) {
if (processor.needsInitialisation()) {
processor.initialise();
}
}
//NOTE the destination needs not to be initialised -> it will be the same
//as for indexing!
//initialisation complete ... now setup the poet processing
//init the queues
// queue capacity scales with the chunk size used for persisting (but never
// below MIN_QUEUE_SIZE) so the persister can drain a full chunk at once
int queueSize = Math.max(MIN_QUEUE_SIZE, chunkSize * 2);
BlockingQueue<QueueItem<Representation>> indexedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
BlockingQueue<QueueItem<Representation>> processedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
BlockingQueue<QueueItem<Representation>> finishedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
BlockingQueue<QueueItem<IndexingError>> errorEntityQueue = new ArrayBlockingQueue<QueueItem<IndexingError>>(queueSize);
//Set holding all active post processing deamons
final SortedSet<IndexingDaemon<?, ?>> activeIndexingDeamons = new TreeSet<IndexingDaemon<?, ?>>();
//create the IndexingDaemos
//TODO: Here we would need to create multiple instances in case
// one would e.g. like to use several threads for processing entities
//(1) the daemon reading from the IndexingSources
// NOTE(review): "Deamon" typos below are part of the runtime daemon/thread
// names; intentionally left unchanged to preserve log output.
String entitySourceReaderName = name + ": post-processing: Entity Reader Deamon";
activeIndexingDeamons.add(new EntityIdBasedIndexingDaemon(entitySourceReaderName, indexedEntityQueue, errorEntityQueue, entityIterator, dataProvider, //no score normaliser
null, //post-process all indexed entities
true));
//(2) The daemon for post-processing the entities
activeIndexingDeamons.add(new EntityProcessorRunnable(name + ": post-processing: Entity Processor Deamon", //it consumes indexed Entities
indexedEntityQueue, //it produces processed Entities
processedEntityQueue, errorEntityQueue, entityPostProcessors, // parsed by the used LineBasedEntityIterator!
Collections.singleton(//ensure the score not changed
SCORE_FIELD)));
//(3) The daemon for persisting the entities
activeIndexingDeamons.add(new EntityPersisterRunnable(name + ": Entity Perstisting Deamon", //it consumes processed Entities
processedEntityQueue, //it produces finished Entities
finishedEntityQueue, errorEntityQueue, chunkSize, indexingDestination.getYard()));
//(4) The daemon for logging finished entities
activeIndexingDeamons.add(new FinishedEntityDaemon(name + ": Finished Entity Logger Deamon", finishedEntityQueue, -1, log, //we have already all entity ids!
null));
//(5) The daemon for logging errors
activeIndexingDeamons.add(new EntityErrorLoggerDaemon(name + ": Entity Error Logging Daemon", errorEntityQueue, log));
//start post-processing and wait until it has finished
// blocks the calling thread until all daemons have completed
startAndWait(activeIndexingDeamons);
//close all post processors
for (EntityProcessor ep : entityPostProcessors) {
ep.close();
}
setState(State.POSTPROCESSED);
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the testEntityDataProvider method of the RdfIndexingSourceTest class.
/**
 * Validates that the {@link EntityDataProvider} configured for the "provider"
 * test configuration is an {@link RdfIndexingSource} and returns a valid
 * Representation for every Entity id reported by the configured
 * {@link EntityIterator}. The provider is closed in a finally block so that
 * subsequent tests are not affected by a still-open data source.
 */
@Test
public void testEntityDataProvider() {
    log.info(" --- testEntityDataProvider ---");
    String testName = "provider";
    // anonymous subclass grants access to the protected IndexingConfig constructor
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    // fixed typo in the assertion message ("whithout" -> "without")
    assertNotNull("Unable to perform test without EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    try {
        assertNotNull(dataProvider);
        if (dataProvider.needsInitialisation()) {
            dataProvider.initialise();
        }
        // JUnit convention: expected value first, actual value second
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        //check if all entities where found
        assertEquals(String.format("%s Entities expected but %s processed!", NUMBER_OF_ENTITIES_EXPECTED, count), NUMBER_OF_ENTITIES_EXPECTED, count);
    } finally {
        //we need to ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
another variant of the testEntityDataProvider method of the RdfIndexingSourceTest class.
/**
 * Validates that the {@link EntityDataProvider} configured for the "provider"
 * test configuration loads its test data, is an {@link RdfIndexingSource}, and
 * returns a valid Representation for every Entity id reported by the
 * configured {@link EntityIterator}.
 * <p>
 * Fixed: the provider is now closed in a finally block — the sibling variant
 * of this test notes that leaving it open makes other tests fail.
 */
@Test
public void testEntityDataProvider() {
    log.info(" --- testEntityDataProvider ---");
    String testName = "provider";
    // anonymous subclass grants access to the protected IndexingConfig constructor
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    // fixed typo in the assertion message ("whithout" -> "without")
    assertNotNull("Unable to perform test without EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    assertNotNull(dataProvider);
    try {
        //there are test data to load
        assertTrue(dataProvider.needsInitialisation());
        dataProvider.initialise();
        // JUnit convention: expected value first, actual value second
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        //check if all entities where found
        assertEquals(String.format("%s Entities expected but %s processed!", NUMBER_OF_ENTITIES_EXPECTED, count), NUMBER_OF_ENTITIES_EXPECTED, count);
    } finally {
        //ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the testQuadsImport method of the RdfIndexingSourceTest class.
/**
 * Tests support for Quads (STANBOL-764): all 9 entities of the "quads" test
 * data must be imported into the default dataset (and not into named graphs),
 * and each must yield a valid Representation from the data provider.
 * <p>
 * Fixed: the provider is now closed in a finally block so subsequent tests
 * are not affected by a still-open data source.
 */
@Test
public void testQuadsImport() {
    log.info(" --- testQuadsImport ---");
    String testName = "quads";
    // anonymous subclass grants access to the protected IndexingConfig constructor
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    // fixed typo in the assertion message ("whithout" -> "without")
    assertNotNull("Unable to perform test without EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    assertNotNull(dataProvider);
    try {
        //there are test data to load
        assertTrue(dataProvider.needsInitialisation());
        dataProvider.initialise();
        // JUnit convention: expected value first, actual value second
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        //check if all 9 entities where imported to the default dataset
        // (and not named graphs)
        assertEquals(String.format("%s Entities expected but %s processed!", 9, count), 9, count);
    } finally {
        //ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the testEntityIdIteratorConfig method of the ConfigTest class.
/**
 * Validates the configured EntityIterator and ScoreNormaliser: entity ids must
 * be dbpedia resource URIs, raw scores must be positive and monotonically
 * non-increasing, and normalised scores must likewise be non-increasing.
 * Entities below the configured "min-score" (here: 2) must normalise to a
 * negative value; the test stops at the first such entity.
 * <p>
 * Fixed: {@code lastNormalisedScore} was never updated inside the loop, so the
 * monotonicity check only verified {@code nScore <= 1} instead of comparing
 * against the previous normalised score.
 */
@Test
public void testEntityIdIteratorConfig() {
    IndexingConfig config = new IndexingConfig();
    EntityIterator iterator = config.getEntityIdIterator();
    ScoreNormaliser normaliser = config.getNormaliser();
    if (iterator.needsInitialisation()) {
        iterator.initialise();
    }
    float lastScore = Float.MAX_VALUE;
    float lastNormalisedScore = 1f;
    while (iterator.hasNext()) {
        EntityScore entity = iterator.next();
        assertNotNull(entity);
        assertNotNull(entity.id);
        assertNotNull(entity.score);
        //log.info("Entity: {}",entity);
        assertTrue(entity.id.startsWith("http://dbpedia.org/resource/"));
        float score = entity.score.floatValue();
        assertTrue(score > 0);
        // raw scores are expected in descending order
        assertTrue(score <= lastScore);
        lastScore = score;
        Float normalisedScore = normaliser.normalise(entity.score);
        assertNotNull(normalisedScore);
        float nScore = normalisedScore.floatValue();
        // normalised scores must not increase as raw scores decrease
        assertTrue(nScore <= lastNormalisedScore);
        // remember the current value for the next iteration (was missing before)
        lastNormalisedScore = nScore;
        if (score < 2) {
            //the value of "min-score" in minincoming
            log.info("score=" + score + " nScore=" + nScore);
            // below min-score the normaliser signals exclusion with a negative value
            assertTrue(nScore < 0);
            return;
        } else {
            assertTrue(nScore > 0);
        }
    }
}
Aggregations