Use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.
The class FieldValueFilterTest, method testExcludeConfig.
@Test
public void testExcludeConfig() {
    EntityProcessor filter = new FieldValueFilter(nsPrefixProvider, "rdf:type", "*;!foaf:Person");
    Representation r = getRepresentation(NamespaceEnum.foaf + "Person");
    Assert.assertNull(filter.process(r));
    r = getRepresentation(NamespaceEnum.skos + "Concept");
    Assert.assertNotNull(filter.process(r));
    r = getRepresentation(NamespaceEnum.skos + "Concept", NamespaceEnum.foaf + "Person");
    Assert.assertNotNull(filter.process(r));
    // test empty value
    filter = new FieldValueFilter(nsPrefixProvider, "skos:releated", "*;!null");
    Assert.assertNull(filter.process(r));
    filter = new FieldValueFilter(nsPrefixProvider, "skos:releated", "*;!");
    Assert.assertNull(filter.process(r));
    filter = new FieldValueFilter(nsPrefixProvider, "skos:releated", "*;!;!foaf:Person");
    Assert.assertNull(filter.process(r));
}
Use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.
The class IndexerImpl, method indexEntities.
@Override
public void indexEntities() {
    synchronized (stateSync) {
        // ensure that two threads do not start the
        // initialisation at the same time ...
        State state = getState();
        if (state.ordinal() < State.INITIALISED.ordinal()) {
            throw new IllegalStateException("The Indexer MUST BE already " + State.INITIALISED + " when calling this Method!");
        }
        if (state != State.INITIALISED) {
            // ignore this call
            return;
        }
        setState(State.INDEXING);
        log.info("{}: indexing started ...", name);
    }
    // init the queues
    int queueSize = Math.max(MIN_QUEUE_SIZE, chunkSize * 2);
    BlockingQueue<QueueItem<Representation>> indexedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
    BlockingQueue<QueueItem<Representation>> processedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
    BlockingQueue<QueueItem<Representation>> finishedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
    BlockingQueue<QueueItem<IndexingError>> errorEntityQueue = new ArrayBlockingQueue<QueueItem<IndexingError>>(queueSize);
    // Set holding all active IndexingDaemons
    final SortedSet<IndexingDaemon<?, ?>> activeIndexingDeamons = new TreeSet<IndexingDaemon<?, ?>>();
    // create the IndexingDaemons
    // TODO: Here we would need to create multiple instances in case
    // one would e.g. like to use several threads for processing entities
    // (1) the daemon reading from the IndexingSources
    String entitySourceReaderName = name + ": Entity Source Reader Deamon";
    if (entityIterator != null) {
        activeIndexingDeamons.add(new EntityIdBasedIndexingDaemon(entitySourceReaderName, indexedEntityQueue, errorEntityQueue, entityIterator, dataProvider, scoreNormaliser, indexAllEntitiesState));
    } else {
        activeIndexingDeamons.add(new EntityDataBasedIndexingDaemon(entitySourceReaderName, indexedEntityQueue, errorEntityQueue, dataIterable, scoreProvider, scoreNormaliser, indexAllEntitiesState));
    }
    // (2) The daemon for processing the entities
    activeIndexingDeamons.add(new EntityProcessorRunnable(name + ": Entity Processor Deamon",
        indexedEntityQueue,   // it consumes indexed Entities
        processedEntityQueue, // it produces processed Entities
        errorEntityQueue, entityProcessors, Collections.singleton(SCORE_FIELD)));
    // (3) The daemon for persisting the entities
    activeIndexingDeamons.add(new EntityPersisterRunnable(name + ": Entity Perstisting Deamon",
        processedEntityQueue, // it consumes processed Entities
        finishedEntityQueue,  // it produces finished Entities
        errorEntityQueue, chunkSize, indexingDestination.getYard()));
    // (4) The daemon for logging finished entities
    activeIndexingDeamons.add(new FinishedEntityDaemon(name + ": Finished Entity Logger Deamon", finishedEntityQueue, -1, log, indexedEntityIdOutputStream));
    // (5) The daemon for logging errors
    activeIndexingDeamons.add(new EntityErrorLoggerDaemon(name + ": Entity Error Logging Daemon", errorEntityQueue, log));
    // start indexing and wait until it has finished
    startAndWait(activeIndexingDeamons);
    // close the stream with the IDs of the indexed entities
    IOUtils.closeQuietly(indexedEntityIdOutputStream);
    // call close on all indexing components
    for (EntityProcessor ep : entityProcessors) {
        ep.close();
    }
    // set the new state to INDEXED
    setState(State.INDEXED);
}
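The method wires the daemons into a pipeline over four bounded queues: reader -> processor -> persister -> finished-entity logger, with a shared error queue, so a slow stage applies backpressure instead of exhausting memory. A standalone sketch of that hand-off pattern using only java.util.concurrent; all names here are illustrative, not Stanbol API.

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class PipelineSketch {

    // a sentinel value signalling "queue finished", like isQueueFinished() below
    private static final String POISON = "\u0000EOF";

    public static void main(String[] args) throws InterruptedException {
        BlockingQueue<String> queue = new ArrayBlockingQueue<String>(100);
        Thread reader = new Thread(() -> {
            try {
                for (int i = 0; i < 1000; i++) {
                    queue.put("entity-" + i); // blocks while the queue is full
                }
                queue.put(POISON); // tell the consumer the stream has ended
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        Thread processor = new Thread(() -> {
            try {
                for (String item = queue.take(); !POISON.equals(item); item = queue.take()) {
                    // process the item; a real daemon would forward it to the next queue
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        reader.start();
        processor.start();
        reader.join();
        processor.join();
    }
}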
Use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.
The class IndexerImpl, method postProcessEntities.
@Override
public void postProcessEntities() {
    synchronized (stateSync) {
        // ensure that two threads do not start the
        // initialisation at the same time ...
        State state = getState();
        if (state.ordinal() < State.INDEXED.ordinal()) {
            throw new IllegalStateException("The Indexer MUST BE already " + State.INDEXED + " when calling this Method!");
        }
        if (state != State.INDEXED) {
            // ignore this call
            return;
        }
        setState(State.POSTPROCESSING);
        log.info("{}: PostProcessing started ...", name);
    }
    if (entityPostProcessors == null || entityPostProcessors.isEmpty()) {
        // nothing to do
        setState(State.POSTPROCESSED);
        return;
    }
    // init the post processing components
    // use an EntityDataProvider based on the indexed data
    EntityDataProvider dataProvider = new YardEntityDataProvider(indexingDestination.getYard());
    // use a LineBasedEntityIterator to iterate over the indexed entity ids
    EntityIterator entityIterator;
    try {
        entityIterator = new LineBasedEntityIterator(getEntityIdFileInputStream(), "UTF-8", null);
    } catch (IOException e) {
        throw new IllegalStateException("Unable to open file containing the " + "IDs of the indexed Entities!", e);
    }
    Map<String, Object> config = new HashMap<String, Object>();
    config.put(LineBasedEntityIterator.PARAM_ID_POS, 1);
    config.put(LineBasedEntityIterator.PARAM_SCORE_POS, Integer.MAX_VALUE);
    entityIterator.setConfiguration(config);
    // does not really make sense for processors
    for (EntityProcessor processor : entityPostProcessors) {
        if (processor.needsInitialisation()) {
            processor.initialise();
        }
    }
    // NOTE: the destination need not be initialised -> it will be the same
    // as for indexing!
    // initialisation complete ... now set up the post processing
    // init the queues
    int queueSize = Math.max(MIN_QUEUE_SIZE, chunkSize * 2);
    BlockingQueue<QueueItem<Representation>> indexedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
    BlockingQueue<QueueItem<Representation>> processedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
    BlockingQueue<QueueItem<Representation>> finishedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
    BlockingQueue<QueueItem<IndexingError>> errorEntityQueue = new ArrayBlockingQueue<QueueItem<IndexingError>>(queueSize);
    // Set holding all active post processing daemons
    final SortedSet<IndexingDaemon<?, ?>> activeIndexingDeamons = new TreeSet<IndexingDaemon<?, ?>>();
    // create the IndexingDaemons
    // TODO: Here we would need to create multiple instances in case
    // one would e.g. like to use several threads for processing entities
    // (1) the daemon reading from the IndexingSources
    String entitySourceReaderName = name + ": post-processing: Entity Reader Deamon";
    activeIndexingDeamons.add(new EntityIdBasedIndexingDaemon(entitySourceReaderName,
        indexedEntityQueue, errorEntityQueue, entityIterator, dataProvider,
        null,   // no score normaliser
        true)); // post-process all indexed entities
    // (2) The daemon for post-processing the entities
    activeIndexingDeamons.add(new EntityProcessorRunnable(name + ": post-processing: Entity Processor Deamon",
        indexedEntityQueue,   // it consumes indexed Entities
        processedEntityQueue, // it produces processed Entities
        errorEntityQueue, entityPostProcessors,
        // ensure the score (parsed by the used LineBasedEntityIterator!) is not changed
        Collections.singleton(SCORE_FIELD)));
    // (3) The daemon for persisting the entities
    activeIndexingDeamons.add(new EntityPersisterRunnable(name + ": Entity Perstisting Deamon",
        processedEntityQueue, // it consumes processed Entities
        finishedEntityQueue,  // it produces finished Entities
        errorEntityQueue, chunkSize, indexingDestination.getYard()));
    // (4) The daemon for logging finished entities
    activeIndexingDeamons.add(new FinishedEntityDaemon(name + ": Finished Entity Logger Deamon",
        finishedEntityQueue, -1, log,
        null)); // no ID output stream - we already have all entity ids!
    // (5) The daemon for logging errors
    activeIndexingDeamons.add(new EntityErrorLoggerDaemon(name + ": Entity Error Logging Daemon", errorEntityQueue, log));
    // start post-processing and wait until it has finished
    startAndWait(activeIndexingDeamons);
    // close all post processors
    for (EntityProcessor ep : entityPostProcessors) {
        ep.close();
    }
    setState(State.POSTPROCESSED);
}
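Both indexEntities and postProcessEntities open with the same guard: check and claim the next state under a lock so only the first caller runs the phase, later callers return early, and the long-running work happens outside the lock. A condensed sketch of that pattern, using the field and State values shown in the snippets above:

synchronized (stateSync) {
    State state = getState();
    if (state.ordinal() < State.INDEXED.ordinal()) {
        // the prerequisite phase has not run yet
        throw new IllegalStateException("Indexer not yet " + State.INDEXED);
    }
    if (state != State.INDEXED) {
        return; // another thread already claimed (or finished) this phase
    }
    setState(State.POSTPROCESSING); // claim the phase before releasing the lock
}
// ... long-running work outside the lock ...
setState(State.POSTPROCESSED);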
Use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.
The class FieldValueFilterTest, method testIncludeConfig.
@Test
public void testIncludeConfig() {
    EntityProcessor filter = new FieldValueFilter(nsPrefixProvider, "rdf:type", "foaf:Person");
    Representation r = getRepresentation(NamespaceEnum.foaf + "Person");
    Assert.assertNotNull(filter.process(r));
    r = getRepresentation(NamespaceEnum.skos + "Concept");
    Assert.assertNull(filter.process(r));
    r = getRepresentation(NamespaceEnum.skos + "Concept", NamespaceEnum.foaf + "Person");
    Assert.assertNotNull(filter.process(r));
    // test empty value
    filter = new FieldValueFilter(nsPrefixProvider, "skos:releated", "");
    Assert.assertNotNull(filter.process(r));
    filter = new FieldValueFilter(nsPrefixProvider, "skos:releated", "null");
    Assert.assertNotNull(filter.process(r));
    filter = new FieldValueFilter(nsPrefixProvider, "skos:releated", null);
    Assert.assertNotNull(filter.process(r));
}
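Taken together, the include and exclude tests pin down the filter's contract: the configuration is a ';'-separated list where plain entries (or "*") include values, '!'-prefixed entries exclude them, "null" or an empty entry stands for the empty value of a missing field, and an entity passes if at least one of its field values is included and not excluded. A sketch of that contract as a plain predicate follows; this is not the FieldValueFilter source, and prefix expansion via the nsPrefixProvider ("foaf:Person" vs. the full URI) is assumed to have happened already.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

// Sketch of the include/exclude semantics the two tests imply.
static boolean passes(Set<String> fieldValues, String config) {
    Set<String> includes = new HashSet<String>();
    Set<String> excludes = new HashSet<String>();
    for (String entry : (config == null ? "" : config).split(";")) {
        if (entry.startsWith("!")) {
            excludes.add(entry.substring(1)); // e.g. "!foaf:Person"
        } else {
            includes.add(entry);              // e.g. "*" or "foaf:Person"
        }
    }
    // a missing field is represented by the empty value
    Set<String> values = fieldValues.isEmpty()
            ? Collections.singleton("") : fieldValues;
    for (String value : values) {
        boolean empty = value.isEmpty();
        boolean included = includes.contains("*") || includes.contains(value)
                || (empty && includes.contains("null"));
        boolean excluded = excludes.contains(value)
                || (empty && excludes.contains("null"));
        if (included && !excluded) {
            return true; // one included value keeps the entity
        }
    }
    return false; // filtered: process(r) would return null
}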
Use of org.apache.stanbol.entityhub.indexing.core.EntityProcessor in project stanbol by apache.
The class EntityProcessorRunnable, method run.
@Override
public void run() {
    while (!isQueueFinished()) {
        QueueItem<Representation> item = consume();
        if (item != null) {
            Long start = Long.valueOf(System.currentTimeMillis());
            item.setProperty(PROCESS_STARTED, start);
            Iterator<EntityProcessor> it = processors.iterator();
            Representation processed = item.getItem();
            log.trace("> process {}", processed);
            EntityProcessor processor = null;
            while (processed != null && it.hasNext()) {
                processor = it.next();
                log.trace(" - with {}", processor);
                processed = processor.process(processed);
            }
            if (processed == null) {
                log.debug("Item {} filtered by processor {}", item.getItem().getId(), processor);
            } else {
                log.trace(" - done");
                for (String key : keys) {
                    // consume the property and add it to the
                    // transformed representation
                    Object value = item.removeProperty(key);
                    if (value != null) {
                        processed.add(key, value);
                    }
                }
                QueueItem<Representation> produced = new QueueItem<Representation>(processed, item);
                Long completed = Long.valueOf(System.currentTimeMillis());
                produced.setProperty(PROCESS_COMPLETE, completed);
                produced.setProperty(PROCESS_DURATION, Float.valueOf((float) (completed.longValue() - start.longValue())));
                produce(produced);
            }
        }
    }
    setFinished();
}
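Note how the inner while loop short-circuits: as soon as any processor returns null, the remaining processors are skipped and the item is logged as filtered instead of being produced. Chaining is therefore plain composition over Representation, as in the fragment below; the processor list is illustrative (ProvenanceStampProcessor is the hypothetical class sketched earlier on this page), and a Representation named representation plus the nsPrefixProvider from the tests are assumed to be in scope.

// Illustrative chain: order matters, and a null short-circuits the rest.
List<EntityProcessor> chain = Arrays.asList(
        new FieldValueFilter(nsPrefixProvider, "rdf:type", "foaf:Person"),
        new ProvenanceStampProcessor("http://example.org/indexedBy", "my-indexer")); // hypothetical
Representation processed = representation;
for (Iterator<EntityProcessor> it = chain.iterator(); processed != null && it.hasNext(); ) {
    processed = it.next().process(processed);
}
if (processed == null) {
    // the entity was filtered by one of the processors in the chain
}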