Search in sources :

Example 1 with EntityProcessor

use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.

the class FieldValueFilterTest method testExcludeConfig.

/**
 * Checks {@link FieldValueFilter} configurations that contain exclusions
 * (values prefixed with {@code !}).
 * <p>
 * With the configuration {@code "*;!foaf:Person"} on {@code rdf:type} the
 * filter is expected to return {@code null} for a representation typed only
 * as foaf:Person and a non-null result for representations typed as
 * skos:Concept, or as both skos:Concept and foaf:Person.
 * <p>
 * The second half runs three configurations against the field
 * {@code skos:releated} and expects each of them to filter the
 * representation (presumably because the test representation carries no
 * value for that field - confirm against {@code getRepresentation}).
 */
@Test
public void testExcludeConfig() {
    final EntityProcessor typeFilter = new FieldValueFilter(nsPrefixProvider, "rdf:type", "*;!foaf:Person");

    // only foaf:Person -> filtered
    final Representation person = getRepresentation(NamespaceEnum.foaf + "Person");
    Assert.assertNull(typeFilter.process(person));

    // only skos:Concept -> accepted
    final Representation concept = getRepresentation(NamespaceEnum.skos + "Concept");
    Assert.assertNotNull(typeFilter.process(concept));

    // skos:Concept and foaf:Person -> accepted
    final Representation conceptAndPerson = getRepresentation(NamespaceEnum.skos + "Concept", NamespaceEnum.foaf + "Person");
    Assert.assertNotNull(typeFilter.process(conceptAndPerson));

    // test empty value: every one of these configurations must filter the entity
    final String fieldName = "skos:releated";
    final String[] emptyValueConfigs = { "*;!null", "*;!", "*;!;!foaf:Person" };
    for (String filterConfig : emptyValueConfigs) {
        EntityProcessor emptyValueFilter = new FieldValueFilter(nsPrefixProvider, fieldName, filterConfig);
        Assert.assertNull(emptyValueFilter.process(conceptAndPerson));
    }
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) EntityProcessor(org.apache.stanbol.entityhub.indexing.core.EntityProcessor) Test(org.junit.Test)

Example 2 with EntityProcessor

use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.

the class IndexerImpl method indexEntities.

/**
 * Runs the indexing phase.
 * <p>
 * The state check and the switch to {@code State.INDEXING} are done while
 * holding {@code stateSync}. If the indexer has not yet reached
 * {@code State.INITIALISED} an {@link IllegalStateException} is thrown; if
 * it is in any state other than {@code INITIALISED} the call returns
 * without doing anything.
 * <p>
 * Afterwards five daemons connected by bounded queues are created
 * (source reader, processor, persister, finished-entity logger and error
 * logger), handed to {@code startAndWait}, and once that call returns the
 * entity-id output stream and all entity processors are closed and the
 * state is set to {@code State.INDEXED}.
 *
 * @throws IllegalStateException if the current state is before
 *         {@code State.INITIALISED}
 */
@Override
public void indexEntities() {
    synchronized (stateSync) {
        // ensure that two threads do not start the
        // indexing at the same time ...
        final State current = getState();
        if (current != State.INITIALISED) {
            if (current.ordinal() < State.INITIALISED.ordinal()) {
                throw new IllegalStateException("The Indexer MUST BE already " + State.INITIALISED + " when calling this Method!");
            }
            // indexing is already running or done -> ignore this call
            return;
        }
        setState(State.INDEXING);
        log.info("{}: indexing started ...", name);
    }

    // init the queues (all share the same capacity)
    final int capacity = Math.max(MIN_QUEUE_SIZE, chunkSize * 2);
    final BlockingQueue<QueueItem<Representation>> sourceQueue = new ArrayBlockingQueue<QueueItem<Representation>>(capacity);
    final BlockingQueue<QueueItem<Representation>> processedQueue = new ArrayBlockingQueue<QueueItem<Representation>>(capacity);
    final BlockingQueue<QueueItem<Representation>> finishedQueue = new ArrayBlockingQueue<QueueItem<Representation>>(capacity);
    final BlockingQueue<QueueItem<IndexingError>> errorQueue = new ArrayBlockingQueue<QueueItem<IndexingError>>(capacity);

    // Set holding all active IndexingDaemons
    final SortedSet<IndexingDaemon<?, ?>> daemons = new TreeSet<IndexingDaemon<?, ?>>();

    // TODO: Here we would need to create multiple instances in case
    // one would e.g. like to use several threads for processing entities

    // (1) the daemon reading from the IndexingSources: id based if an
    // entity iterator is available, data based otherwise
    final String readerName = name + ": Entity Source Reader Deamon";
    final IndexingDaemon<?, ?> reader;
    if (entityIterator == null) {
        reader = new EntityDataBasedIndexingDaemon(readerName, sourceQueue, errorQueue, dataIterable, scoreProvider, scoreNormaliser, indexAllEntitiesState);
    } else {
        reader = new EntityIdBasedIndexingDaemon(readerName, sourceQueue, errorQueue, entityIterator, dataProvider, scoreNormaliser, indexAllEntitiesState);
    }
    daemons.add(reader);

    // (2) The daemon for processing the entities:
    // it consumes indexed Entities and produces processed Entities
    daemons.add(new EntityProcessorRunnable(name + ": Entity Processor Deamon", sourceQueue, processedQueue, errorQueue, entityProcessors, Collections.singleton(SCORE_FIELD)));

    // (3) The daemon for persisting the entities:
    // it consumes processed Entities and produces finished Entities
    daemons.add(new EntityPersisterRunnable(name + ": Entity Perstisting Deamon", processedQueue, finishedQueue, errorQueue, chunkSize, indexingDestination.getYard()));

    // (4) The daemon for logging finished entities
    daemons.add(new FinishedEntityDaemon(name + ": Finished Entity Logger Deamon", finishedQueue, -1, log, indexedEntityIdOutputStream));

    // (5) The daemon for logging errors
    daemons.add(new EntityErrorLoggerDaemon(name + ": Entity Error Logging Daemon", errorQueue, log));

    // start indexing and wait until it has finished
    startAndWait(daemons);

    // close the stream with IDs
    IOUtils.closeQuietly(indexedEntityIdOutputStream);

    // call close on all indexing components
    for (EntityProcessor processor : entityProcessors) {
        processor.close();
    }

    // set the new state to INDEXED
    setState(State.INDEXED);
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) TreeSet(java.util.TreeSet) EntityProcessor(org.apache.stanbol.entityhub.indexing.core.EntityProcessor)

Example 3 with EntityProcessor

use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.

the class IndexerImpl method postProcessEntities.

/**
 * Runs the post-processing phase over the already indexed entities.
 * <p>
 * The state check and the switch to {@code State.POSTPROCESSING} are done
 * while holding {@code stateSync}. If the indexer has not yet reached
 * {@code State.INDEXED} an {@link IllegalStateException} is thrown; if it
 * is in any state other than {@code INDEXED} the call returns without doing
 * anything. When no post-processors are configured the state is set
 * directly to {@code State.POSTPROCESSED}.
 * <p>
 * Otherwise the entity ids are read through a
 * {@code LineBasedEntityIterator} over the entity-id file, the entity data
 * through a {@code YardEntityDataProvider} backed by the indexing
 * destination's yard, and the same five-daemon pipeline as used for
 * indexing is executed with the post-processors. Finally all
 * post-processors are closed and the state is set to
 * {@code State.POSTPROCESSED}.
 *
 * @throws IllegalStateException if the current state is before
 *         {@code State.INDEXED}, or if the file with the indexed entity ids
 *         cannot be opened
 */
@Override
public void postProcessEntities() {
    synchronized (stateSync) {
        // ensure that two threads do not start the
        // post-processing at the same time ...
        final State current = getState();
        if (current != State.INDEXED) {
            if (current.ordinal() < State.INDEXED.ordinal()) {
                throw new IllegalStateException("The Indexer MUST BE already " + State.INDEXED + " when calling this Method!");
            }
            // post-processing is already running or done -> ignore this call
            return;
        }
        setState(State.POSTPROCESSING);
        log.info("{}: PostProcessing started ...", name);
    }

    if (entityPostProcessors == null || entityPostProcessors.isEmpty()) {
        // nothing to do
        setState(State.POSTPROCESSED);
        return;
    }

    // init the post processing components
    // use an EntityDataProvider based on the indexed data
    final EntityDataProvider indexedData = new YardEntityDataProvider(indexingDestination.getYard());

    // use a LineBasedEntityIterator to iterate over the indexed entity ids
    final EntityIterator indexedIds;
    try {
        indexedIds = new LineBasedEntityIterator(getEntityIdFileInputStream(), "UTF-8", null);
    } catch (IOException e) {
        throw new IllegalStateException("Unable to open file containing the IDs of the indexed Entities!", e);
    }
    final Map<String, Object> iteratorConfig = new HashMap<String, Object>();
    iteratorConfig.put(LineBasedEntityIterator.PARAM_ID_POS, 1);
    iteratorConfig.put(LineBasedEntityIterator.PARAM_SCORE_POS, Integer.MAX_VALUE);
    indexedIds.setConfiguration(iteratorConfig);

    // does not really make sense for processors
    for (EntityProcessor postProcessor : entityPostProcessors) {
        if (postProcessor.needsInitialisation()) {
            postProcessor.initialise();
        }
    }
    // NOTE the destination needs not to be initialised -> it will be the same
    // as for indexing!

    // initialisation complete ... now setup the post processing
    // init the queues (all share the same capacity)
    final int capacity = Math.max(MIN_QUEUE_SIZE, chunkSize * 2);
    final BlockingQueue<QueueItem<Representation>> sourceQueue = new ArrayBlockingQueue<QueueItem<Representation>>(capacity);
    final BlockingQueue<QueueItem<Representation>> processedQueue = new ArrayBlockingQueue<QueueItem<Representation>>(capacity);
    final BlockingQueue<QueueItem<Representation>> finishedQueue = new ArrayBlockingQueue<QueueItem<Representation>>(capacity);
    final BlockingQueue<QueueItem<IndexingError>> errorQueue = new ArrayBlockingQueue<QueueItem<IndexingError>>(capacity);

    // Set holding all active post processing daemons
    final SortedSet<IndexingDaemon<?, ?>> daemons = new TreeSet<IndexingDaemon<?, ?>>();

    // TODO: Here we would need to create multiple instances in case
    // one would e.g. like to use several threads for processing entities

    // (1) the daemon reading the indexed entities:
    // no score normaliser (null); post-process all indexed entities (true)
    final String readerName = name + ": post-processing: Entity Reader Deamon";
    daemons.add(new EntityIdBasedIndexingDaemon(readerName, sourceQueue, errorQueue, indexedIds, indexedData, null, true));

    // (2) The daemon for post-processing the entities:
    // it consumes indexed Entities and produces processed Entities.
    // Original notes on the SCORE_FIELD argument: "ensure the score not
    // changed" / "parsed by the used LineBasedEntityIterator!"
    daemons.add(new EntityProcessorRunnable(name + ": post-processing: Entity Processor Deamon", sourceQueue, processedQueue, errorQueue, entityPostProcessors, Collections.singleton(SCORE_FIELD)));

    // (3) The daemon for persisting the entities:
    // it consumes processed Entities and produces finished Entities
    daemons.add(new EntityPersisterRunnable(name + ": Entity Perstisting Deamon", processedQueue, finishedQueue, errorQueue, chunkSize, indexingDestination.getYard()));

    // (4) The daemon for logging finished entities:
    // no id output stream (null) - we have already all entity ids!
    daemons.add(new FinishedEntityDaemon(name + ": Finished Entity Logger Deamon", finishedQueue, -1, log, null));

    // (5) The daemon for logging errors
    daemons.add(new EntityErrorLoggerDaemon(name + ": Entity Error Logging Daemon", errorQueue, log));

    // start post-processing and wait until it has finished
    startAndWait(daemons);

    // close all post processors
    for (EntityProcessor postProcessor : entityPostProcessors) {
        postProcessor.close();
    }
    setState(State.POSTPROCESSED);
}
Also used : EntityDataProvider(org.apache.stanbol.entityhub.indexing.core.EntityDataProvider) YardEntityDataProvider(org.apache.stanbol.entityhub.indexing.core.source.YardEntityDataProvider) HashMap(java.util.HashMap) EntityIterator(org.apache.stanbol.entityhub.indexing.core.EntityIterator) LineBasedEntityIterator(org.apache.stanbol.entityhub.indexing.core.source.LineBasedEntityIterator) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) TreeSet(java.util.TreeSet) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) IOException(java.io.IOException) IndexingDaemonEventObject(org.apache.stanbol.entityhub.indexing.core.impl.IndexingDaemon.IndexingDaemonEventObject) IndexingSourceEventObject(org.apache.stanbol.entityhub.indexing.core.impl.IndexingSourceInitialiser.IndexingSourceEventObject) EntityProcessor(org.apache.stanbol.entityhub.indexing.core.EntityProcessor)

Example 4 with EntityProcessor

use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.

the class FieldValueFilterTest method testIncludeConfig.

/**
 * Checks {@link FieldValueFilter} configurations that only list included
 * values.
 * <p>
 * With the configuration {@code "foaf:Person"} on {@code rdf:type} the
 * filter is expected to return a non-null result for representations typed
 * as foaf:Person (alone or together with skos:Concept) and {@code null} for
 * a representation typed only as skos:Concept.
 * <p>
 * The second half configures the field {@code skos:releated} with an empty
 * string, the string {@code "null"} and a {@code null} reference, and
 * expects each of those filters to accept the representation (presumably
 * because the test representation carries no value for that field -
 * confirm against {@code getRepresentation}).
 */
@Test
public void testIncludeConfig() {
    final EntityProcessor personFilter = new FieldValueFilter(nsPrefixProvider, "rdf:type", "foaf:Person");

    // only foaf:Person -> accepted
    final Representation person = getRepresentation(NamespaceEnum.foaf + "Person");
    Assert.assertNotNull(personFilter.process(person));

    // only skos:Concept -> filtered
    final Representation concept = getRepresentation(NamespaceEnum.skos + "Concept");
    Assert.assertNull(personFilter.process(concept));

    // skos:Concept and foaf:Person -> accepted
    final Representation conceptAndPerson = getRepresentation(NamespaceEnum.skos + "Concept", NamespaceEnum.foaf + "Person");
    Assert.assertNotNull(personFilter.process(conceptAndPerson));

    // test empty value: all three ways of configuring it must accept the entity
    final String fieldName = "skos:releated";

    final EntityProcessor emptyStringFilter = new FieldValueFilter(nsPrefixProvider, fieldName, "");
    Assert.assertNotNull(emptyStringFilter.process(conceptAndPerson));

    final EntityProcessor nullStringFilter = new FieldValueFilter(nsPrefixProvider, fieldName, "null");
    Assert.assertNotNull(nullStringFilter.process(conceptAndPerson));

    final EntityProcessor nullReferenceFilter = new FieldValueFilter(nsPrefixProvider, fieldName, null);
    Assert.assertNotNull(nullReferenceFilter.process(conceptAndPerson));
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) EntityProcessor(org.apache.stanbol.entityhub.indexing.core.EntityProcessor) Test(org.junit.Test)

Example 5 with EntityProcessor

use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.

the class EntityProcessorRunnable method run.

/**
 * Consumes queue items until {@code isQueueFinished()} reports
 * {@code true}, passes each item's representation through the configured
 * processors in iteration order and produces a new item for every
 * representation that survives the whole chain.
 * <p>
 * A processor returning {@code null} stops the chain for that item; the
 * item is then only logged at debug level and nothing is produced. For
 * produced items the properties named in {@code keys} are removed from the
 * consumed item and, when non-null, added to the processed representation.
 * The properties {@code PROCESS_STARTED}, {@code PROCESS_COMPLETE} and
 * {@code PROCESS_DURATION} record the timing in milliseconds.
 * {@code setFinished()} is called once the loop ends.
 */
@Override
public void run() {
    while (!isQueueFinished()) {
        final QueueItem<Representation> consumed = consume();
        if (consumed == null) {
            // nothing to process this round - re-check the queue state
            continue;
        }

        final long startMillis = System.currentTimeMillis();
        consumed.setProperty(PROCESS_STARTED, Long.valueOf(startMillis));

        final Iterator<EntityProcessor> chain = processors.iterator();
        Representation current = consumed.getItem();
        log.trace("> process {}", current);

        // remembers the processor applied last, so a filtered item can be
        // attributed to it in the log
        EntityProcessor lastApplied = null;
        while (current != null && chain.hasNext()) {
            lastApplied = chain.next();
            log.trace("   - with {}", lastApplied);
            current = lastApplied.process(current);
        }

        if (current == null) {
            log.debug("Item {} filtered by processor {}", consumed.getItem().getId(), lastApplied);
            continue;
        }

        log.trace("   - done");
        // consume the configured properties and add them to the
        // transformed representation
        for (String key : keys) {
            final Object propertyValue = consumed.removeProperty(key);
            if (propertyValue != null) {
                current.add(key, propertyValue);
            }
        }

        final QueueItem<Representation> result = new QueueItem<Representation>(current, consumed);
        final long completedMillis = System.currentTimeMillis();
        result.setProperty(PROCESS_COMPLETE, Long.valueOf(completedMillis));
        result.setProperty(PROCESS_DURATION, Float.valueOf((float) (completedMillis - startMillis)));
        produce(result);
    }
    setFinished();
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) EntityProcessor(org.apache.stanbol.entityhub.indexing.core.EntityProcessor)

Aggregations

EntityProcessor (org.apache.stanbol.entityhub.indexing.core.EntityProcessor): 5, Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation): 5, TreeSet (java.util.TreeSet): 2, ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue): 2, Test (org.junit.Test): 2, IOException (java.io.IOException): 1, HashMap (java.util.HashMap): 1, EntityDataProvider (org.apache.stanbol.entityhub.indexing.core.EntityDataProvider): 1, EntityIterator (org.apache.stanbol.entityhub.indexing.core.EntityIterator): 1, IndexingDaemonEventObject (org.apache.stanbol.entityhub.indexing.core.impl.IndexingDaemon.IndexingDaemonEventObject): 1, IndexingSourceEventObject (org.apache.stanbol.entityhub.indexing.core.impl.IndexingSourceInitialiser.IndexingSourceEventObject): 1, LineBasedEntityIterator (org.apache.stanbol.entityhub.indexing.core.source.LineBasedEntityIterator): 1, YardEntityDataProvider (org.apache.stanbol.entityhub.indexing.core.source.YardEntityDataProvider): 1