Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the postProcessEntities method of the IndexerImpl class.
/**
 * Runs the configured {@code entityPostProcessors} over all Entities that were
 * already written to the indexing destination. Transitions the Indexer state
 * from INDEXED to POSTPROCESSING and finally to POSTPROCESSED. Throws an
 * {@link IllegalStateException} if the Indexer has not yet reached the INDEXED
 * state, and is a no-op if post-processing was already started/completed by
 * another thread. Thread-safe with respect to concurrent invocations.
 */
@Override
public void postProcessEntities() {
synchronized (stateSync) {
//ensure that two threads do not start the
//initialisation at the same time ...
State state = getState();
if (state.ordinal() < State.INDEXED.ordinal()) {
throw new IllegalStateException("The Indexer MUST BE already " + State.INDEXED + " when calling this Method!");
}
if (state != State.INDEXED) {
// already POSTPROCESSING/POSTPROCESSED (some other thread got here first)
// ignore this call
return;
}
// claim the POSTPROCESSING phase while still holding the lock
setState(State.POSTPROCESSING);
log.info("{}: PostProcessing started ...", name);
}
// no processors configured -> phase completes immediately
if (entityPostProcessors == null || entityPostProcessors.isEmpty()) {
setState(State.POSTPROCESSED);
//nothing to do
return;
}
//init the post processing components
//use an EntityDataProvider based on the indexed data
EntityDataProvider dataProvider = new YardEntityDataProvider(indexingDestination.getYard());
//use an LineBasedEntityIterator to iterate over the indexed entity ids
EntityIterator entityIterator;
try {
entityIterator = new LineBasedEntityIterator(getEntityIdFileInputStream(), "UTF-8", null);
} catch (IOException e) {
throw new IllegalStateException("Unable to open file containing the " + "IDs of the indexed Entities!", e);
}
// configure the iterator: the id is in column 1; Integer.MAX_VALUE for the
// score position means "no score column" (scores are not re-parsed here)
Map<String, Object> config = new HashMap<String, Object>();
config.put(LineBasedEntityIterator.PARAM_ID_POS, 1);
config.put(LineBasedEntityIterator.PARAM_SCORE_POS, Integer.MAX_VALUE);
entityIterator.setConfiguration(config);
//does not really make sense for processors
for (EntityProcessor processor : entityPostProcessors) {
if (processor.needsInitialisation()) {
processor.initialise();
}
}
//NOTE the destination needs not to be initialised -> it will be the same
//as for indexing!
//initialisation complete ... now setup the poet processing
//init the queues
// queue capacity scales with the chunk size used for persisting (but never
// below MIN_QUEUE_SIZE) so the persister can drain a full chunk at once
int queueSize = Math.max(MIN_QUEUE_SIZE, chunkSize * 2);
BlockingQueue<QueueItem<Representation>> indexedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
BlockingQueue<QueueItem<Representation>> processedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
BlockingQueue<QueueItem<Representation>> finishedEntityQueue = new ArrayBlockingQueue<QueueItem<Representation>>(queueSize);
BlockingQueue<QueueItem<IndexingError>> errorEntityQueue = new ArrayBlockingQueue<QueueItem<IndexingError>>(queueSize);
//Set holding all active post processing deamons
final SortedSet<IndexingDaemon<?, ?>> activeIndexingDeamons = new TreeSet<IndexingDaemon<?, ?>>();
//create the IndexingDaemos
//TODO: Here we would need to create multiple instances in case
// one would e.g. like to use several threads for processing entities
//(1) the daemon reading from the IndexingSources
// NOTE(review): "Deamon" typos below are part of the runtime daemon/thread
// names; intentionally left unchanged to preserve log output.
String entitySourceReaderName = name + ": post-processing: Entity Reader Deamon";
activeIndexingDeamons.add(new EntityIdBasedIndexingDaemon(entitySourceReaderName, indexedEntityQueue, errorEntityQueue, entityIterator, dataProvider, //no score normaliser
null, //post-process all indexed entities
true));
//(2) The daemon for post-processing the entities
activeIndexingDeamons.add(new EntityProcessorRunnable(name + ": post-processing: Entity Processor Deamon", //it consumes indexed Entities
indexedEntityQueue, //it produces processed Entities
processedEntityQueue, errorEntityQueue, entityPostProcessors, // parsed by the used LineBasedEntityIterator!
Collections.singleton(//ensure the score not changed
SCORE_FIELD)));
//(3) The daemon for persisting the entities
activeIndexingDeamons.add(new EntityPersisterRunnable(name + ": Entity Perstisting Deamon", //it consumes processed Entities
processedEntityQueue, //it produces finished Entities
finishedEntityQueue, errorEntityQueue, chunkSize, indexingDestination.getYard()));
//(4) The daemon for logging finished entities
activeIndexingDeamons.add(new FinishedEntityDaemon(name + ": Finished Entity Logger Deamon", finishedEntityQueue, -1, log, //we have already all entity ids!
null));
//(5) The daemon for logging errors
activeIndexingDeamons.add(new EntityErrorLoggerDaemon(name + ": Entity Error Logging Daemon", errorEntityQueue, log));
//start post-processing and wait until it has finished
// blocks the calling thread until all daemons have completed
startAndWait(activeIndexingDeamons);
//close all post processors
for (EntityProcessor ep : entityPostProcessors) {
ep.close();
}
setState(State.POSTPROCESSED);
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the testEntityDataProvider method of the RdfIndexingSourceTest class.
/**
 * Validates that the {@link EntityDataProvider} configured for the "provider"
 * test configuration is an {@link RdfIndexingSource} and returns a valid
 * Representation for every Entity id reported by the configured
 * {@link EntityIterator}. The provider is closed in a finally block so that
 * subsequent tests are not affected by a still-open data source.
 */
@Test
public void testEntityDataProvider() {
    log.info(" --- testEntityDataProvider ---");
    String testName = "provider";
    // anonymous subclass grants access to the protected IndexingConfig constructor
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    // fixed typo in the assertion message ("whithout" -> "without")
    assertNotNull("Unable to perform test without EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    try {
        assertNotNull(dataProvider);
        if (dataProvider.needsInitialisation()) {
            dataProvider.initialise();
        }
        // JUnit convention: expected value first, actual value second
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        //check if all entities where found
        assertEquals(String.format("%s Entities expected but %s processed!", NUMBER_OF_ENTITIES_EXPECTED, count), NUMBER_OF_ENTITIES_EXPECTED, count);
    } finally {
        //we need to ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
another variant of the testEntityDataProvider method of the RdfIndexingSourceTest class.
/**
 * Validates that the {@link EntityDataProvider} configured for the "provider"
 * test configuration loads its test data, is an {@link RdfIndexingSource}, and
 * returns a valid Representation for every Entity id reported by the
 * configured {@link EntityIterator}.
 * <p>
 * Fixed: the provider is now closed in a finally block — the sibling variant
 * of this test notes that leaving it open makes other tests fail.
 */
@Test
public void testEntityDataProvider() {
    log.info(" --- testEntityDataProvider ---");
    String testName = "provider";
    // anonymous subclass grants access to the protected IndexingConfig constructor
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    // fixed typo in the assertion message ("whithout" -> "without")
    assertNotNull("Unable to perform test without EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    assertNotNull(dataProvider);
    try {
        //there are test data to load
        assertTrue(dataProvider.needsInitialisation());
        dataProvider.initialise();
        // JUnit convention: expected value first, actual value second
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        //check if all entities where found
        assertEquals(String.format("%s Entities expected but %s processed!", NUMBER_OF_ENTITIES_EXPECTED, count), NUMBER_OF_ENTITIES_EXPECTED, count);
    } finally {
        //ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the testQuadsImport method of the RdfIndexingSourceTest class.
/**
 * Tests support for Quads (STANBOL-764): all 9 entities of the "quads" test
 * data must be imported into the default dataset (and not into named graphs),
 * and each must yield a valid Representation from the data provider.
 * <p>
 * Fixed: the provider is now closed in a finally block so subsequent tests
 * are not affected by a still-open data source.
 */
@Test
public void testQuadsImport() {
    log.info(" --- testQuadsImport ---");
    String testName = "quads";
    // anonymous subclass grants access to the protected IndexingConfig constructor
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    // fixed typo in the assertion message ("whithout" -> "without")
    assertNotNull("Unable to perform test without EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    assertNotNull(dataProvider);
    try {
        //there are test data to load
        assertTrue(dataProvider.needsInitialisation());
        dataProvider.initialise();
        // JUnit convention: expected value first, actual value second
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        //check if all 9 entities where imported to the default dataset
        // (and not named graphs)
        assertEquals(String.format("%s Entities expected but %s processed!", 9, count), 9, count);
    } finally {
        //ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityIterator in the Apache Stanbol project:
the testEntityIdIteratorConfig method of the ConfigTest class.
/**
 * Validates the configured EntityIterator and ScoreNormaliser: entity ids must
 * be dbpedia resource URIs, raw scores must be positive and monotonically
 * non-increasing, and normalised scores must likewise be non-increasing.
 * Entities below the configured "min-score" (here: 2) must normalise to a
 * negative value; the test stops at the first such entity.
 * <p>
 * Fixed: {@code lastNormalisedScore} was never updated inside the loop, so the
 * monotonicity check only verified {@code nScore <= 1} instead of comparing
 * against the previous normalised score.
 */
@Test
public void testEntityIdIteratorConfig() {
    IndexingConfig config = new IndexingConfig();
    EntityIterator iterator = config.getEntityIdIterator();
    ScoreNormaliser normaliser = config.getNormaliser();
    if (iterator.needsInitialisation()) {
        iterator.initialise();
    }
    float lastScore = Float.MAX_VALUE;
    float lastNormalisedScore = 1f;
    while (iterator.hasNext()) {
        EntityScore entity = iterator.next();
        assertNotNull(entity);
        assertNotNull(entity.id);
        assertNotNull(entity.score);
        //log.info("Entity: {}",entity);
        assertTrue(entity.id.startsWith("http://dbpedia.org/resource/"));
        float score = entity.score.floatValue();
        assertTrue(score > 0);
        // raw scores are expected in descending order
        assertTrue(score <= lastScore);
        lastScore = score;
        Float normalisedScore = normaliser.normalise(entity.score);
        assertNotNull(normalisedScore);
        float nScore = normalisedScore.floatValue();
        // normalised scores must not increase as raw scores decrease
        assertTrue(nScore <= lastNormalisedScore);
        // remember the current value for the next iteration (was missing before)
        lastNormalisedScore = nScore;
        if (score < 2) {
            //the value of "min-score" in minincoming
            log.info("score=" + score + " nScore=" + nScore);
            // below min-score the normaliser signals exclusion with a negative value
            assertTrue(nScore < 0);
            return;
        } else {
            assertTrue(nScore > 0);
        }
    }
}
Aggregations