Example usage of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in the Apache Stanbol project, taken from the class RdfIndexingSourceTest, method testBlankNodeSupport.
/**
 * Verifies that the {@link RdfIndexingSource} also indexes Entities that are
 * represented by BlankNodes. Iterates over all Entities provided by the
 * "bnode" test configuration and validates each Representation.
 */
@Test
public void testBlankNodeSupport() {
    log.info(" --- testBlankNodeSupport ---");
    final String testName = "bnode";
    final IndexingConfig config = new IndexingConfig(
        CONFIG_ROOT + File.separatorChar + testName,
        CONFIG_ROOT + '/' + testName) {
    };
    final EntityDataIterable dataIterable = config.getDataIterable();
    assertNotNull(dataIterable);
    assertEquals(dataIterable.getClass(), RdfIndexingSource.class);
    assertTrue(dataIterable.needsInitialisation());
    dataIterable.initialise();
    long processed = 0;
    for (EntityDataIterator entities = dataIterable.entityDataIterator(); entities.hasNext(); ) {
        final String entityId = entities.next();
        log.info("validate Entity " + entityId);
        assertNotNull(entityId);
        validateRepresentation(entities.getRepresentation(), entityId);
        processed++;
    }
    // expected are exactly 3 entities: France from france.rdf and the
    // two BlankNode Entities from bnode.nt
    assertEquals(String.format("> %s Entities expected but only %s processed!", 3, processed), 3, processed);
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in the Apache Stanbol project, taken from the class RdfIndexingSourceTest, method testEntityDataIterable.
/**
 * Validates the {@link EntityDataIterable} created for the "iterable" test
 * configuration: every iterated Entity must have a valid Representation and
 * at least the number of Entities listed in textEntityIDs.txt must be
 * processed.
 */
@Test
public void testEntityDataIterable() {
    log.info(" --- testEntityDataIterable ---");
    final String testName = "iterable";
    final IndexingConfig config = new IndexingConfig(
        CONFIG_ROOT + File.separatorChar + testName,
        CONFIG_ROOT + '/' + testName) {
    };
    final EntityDataIterable dataIterable = config.getDataIterable();
    assertNotNull(dataIterable);
    assertEquals(dataIterable.getClass(), RdfIndexingSource.class);
    assertTrue(dataIterable.needsInitialisation());
    dataIterable.initialise();
    long processed = 0;
    for (EntityDataIterator entities = dataIterable.entityDataIterator(); entities.hasNext(); ) {
        final String entityId = entities.next();
        log.info("validate Entity " + entityId);
        assertNotNull(entityId);
        validateRepresentation(entities.getRepresentation(), entityId);
        processed++;
    }
    // at least as many entities as listed by the textEntityIDs.txt file
    // must have been indexed
    assertTrue(String.format("> %s Entities expected but only %s processed!", NUMBER_OF_ENTITIES_EXPECTED, processed), NUMBER_OF_ENTITIES_EXPECTED <= processed);
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in the Apache Stanbol project, taken from the class RdfIndexingSource, method close.
/**
 * Closes this indexing source: first closes all still active
 * {@link EntityDataIterator} instances, then releases the connections used
 * for LDPath and the EntityDataProvider and finally - if this instance is
 * responsible for it - shuts down the Sesame repository.
 */
@Override
public void close() {
    // make sure no iterator keeps a connection open
    for (EntityDataIterator openIterator : entityDataIterators) {
        openIterator.close();
    }
    // release the connections used for LDPath and the EntityDataProvider
    ungetLdPathConnection();
    ungetEntityDataProviderConnection();
    // only shut down the repository if this instance manages its lifecycle
    if (!shutdownRepository) {
        return;
    }
    try {
        repository.shutDown();
    } catch (RepositoryException e) {
        log.warn("Error while closing Sesame Connection", e);
    }
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in the Apache Stanbol project, taken from the class GeonamesIndexingSource, method entityDataIterator.
/**
 * Returns an {@link EntityDataIterator} over the geonames.org main dump.
 * Each line of the dump resources is parsed into a {@link Representation}.
 * NOTE: the backing resources are consumed while iterating, so this
 * implementation supports only a single iteration over the data.
 * @return the iterator over the entity data of the geonames dump
 * @throws IllegalStateException if this method is called more than once
 */
@Override
public EntityDataIterator entityDataIterator() {
    if (!consumed) {
        consumed = true;
    } else {
        // fixed message: the original concatenation was missing a space
        // ("only asingle Iteration")
        throw new IllegalStateException("This implementation supports only a "
                + "single Iteration of the data.");
    }
    return new EntityDataIterator() {

        // iterator over the dump resources still to be processed
        Iterator<RDFTerm> resources = resourceList.iterator();

        // the resource currently being read
        RDFTerm r;

        // line iterator over the current resource (null if none is open)
        LineIterator it = null;

        // the next line to process (lazily fetched by hasNext()/next())
        private String next;

        // the Representation parsed for the last line returned by next()
        private Representation rep;

        /**
         * Returns the next line of the dump or <code>null</code> if all
         * resources are fully consumed. Transparently advances to the next
         * resource when the current one has no more lines.
         */
        private String getNext() {
            while ((it == null || !it.hasNext()) && resources != null && resources.hasNext()) {
                if (r != null) {
                    IOUtils.closeQuietly(r.is);
                }
                r = resources.next();
                try {
                    it = r.getEntries();
                } catch (IOException e) {
                    // log.error(..., e) already includes the stack trace, so
                    // no additional printStackTrace() is needed; continue
                    // with the next resource
                    log.error("Unable to read RDFTerm '" + r.getName() + "' because of " + e.getMessage(), e);
                    IOUtils.closeQuietly(r.is);
                    it = null;
                }
                // free the consumed resource to reduce memory footprint
                resources.remove();
            }
            if (it != null && it.hasNext()) {
                return it.nextLine();
            } else {
                return null;
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        @Override
        public String next() {
            if (next == null) {
                next = getNext();
            }
            if (next == null) {
                throw new NoSuchElementException();
            } else {
                rep = processGeonameEntry(next);
                next = null;
                return rep.getId();
            }
        }

        @Override
        public boolean hasNext() {
            if (next == null) {
                next = getNext();
            }
            return next != null;
        }

        @Override
        public Representation getRepresentation() {
            return rep;
        }

        @Override
        public void close() {
            if (r != null) {
                IOUtils.closeQuietly(r.is);
            }
            next = null;
            it = null;
            resources = null;
        }

        /**
         * Parses the Representation from the current line.<p>
         * NOTE: this does not process alternate labels and also does not
         * lookup entities for parent codes. Those things are done now by
         * own EntityProcessors
         * @param line the line to process (tab separated geonames dump entry)
         * @return the representation
         */
        private Representation processGeonameEntry(String line) {
            LineTokenizer t = new LineTokenizer(line);
            // [0] geonames id
            String id = t.next();
            Integer geoNamesId = Integer.parseInt(id);
            // create a new Doc based on the first Element (geonamesID)
            Representation doc = valueFactory.createRepresentation(new StringBuilder(GEONAMES_RESOURCE_NS).append(id).append('/').toString());
            // add the Integer id so that we do not need to parse it from the subject URI
            doc.add(GeonamesPropertyEnum.idx_id.toString(), geoNamesId);
            // add the geonames:Feature type
            doc.add(GeonamesPropertyEnum.rdf_type.toString(), getReference(GeonamesPropertyEnum.gn_Feature.toString()));
            // [1] UTF-8 name
            String utf8Label = t.next();
            // [2] ASKII Name as rdfs:label
            String askiiLabel = t.next();
            if (utf8Label == null) {
                // use ASKII label as fallback for the utf8 version
                utf8Label = askiiLabel;
            }
            doc.addNaturalText(GeonamesPropertyEnum.gn_name.toString(), utf8Label);
            // [3] Alternate Names
            // alternate names are added later during processing
            t.next();
            // [4] lat
            doc.add(GeonamesPropertyEnum.geo_lat.toString(), new BigDecimal(t.next()));
            // [5] lon
            doc.add(GeonamesPropertyEnum.geo_long.toString(), new BigDecimal(t.next()));
            // [6] featureClass
            String featureClass = new StringBuilder(GEONAMES_ONTOLOGY_NS).append(t.next()).toString();
            doc.add(GeonamesPropertyEnum.gn_featureClass.toString(), getReference(featureClass));
            // [7] featureCode (-> need to use <featureClass>.<featureCode>!!)
            doc.add(GeonamesPropertyEnum.gn_featureCode.toString(), getReference(new StringBuilder(featureClass).append('.').append(t.next()).toString()));
            // countryCode
            // -> geonames uses here the link to an HTML Page showing the Country
            // We would like to use an Link to a SKOS:Concept representing the Country
            // ... But luckily here we need only to add the URI!
            Set<String> ccs = new HashSet<String>();
            // [8] countryCode
            String countryCode = t.next();
            if (countryCode != null) {
                // need to trim because some country codes use ' ' to indicate null!
                countryCode = countryCode.trim();
                if (countryCode.length() == 2) {
                    // Yes there are some features that are in no country!
                    ccs.add(countryCode);
                }
            }
            // [9] alternate countryCodes
            String altCc = t.next();
            if (altCc != null) {
                StringTokenizer altCcT = new StringTokenizer(altCc, ",");
                while (altCcT.hasMoreElements()) {
                    countryCode = altCcT.nextToken();
                    if (countryCode.length() == 2) {
                        ccs.add(countryCode);
                    }
                }
            }
            if (!ccs.isEmpty()) {
                doc.add(GeonamesPropertyEnum.gn_countryCode.toString(), ccs);
            }
            // [10 TO 13] Admin codes
            // first read them -> we need to consume the tokens anyway
            String[] adminCodes = new String[] { // country
            countryCode, // ADM1
            t.next(), // ADM2
            t.next(), // ADM3
            t.next(), // ADM4
            t.next() };
            // Workaround for Admin1 -> add leading '0' for single Value
            if (adminCodes[1] != null && adminCodes[1].length() < 2) {
                adminCodes[1] = '0' + adminCodes[1];
            }
            // now process the admin Codes (including the country at index 0)
            StringBuilder parentCode = new StringBuilder();
            // iterate over parent codes until the first NULL (or '00' unknown) element
            for (int i = 0; i < adminCodes.length && adminCodes[i] != null && !adminCodes[i].equals("00"); i++) {
                if (i > 0) {
                    parentCode.append('.');
                }
                // add the current (last) Element
                parentCode.append(adminCodes[i]);
                String property = i == 0 ? GeonamesPropertyEnum.idx_CC.toString() : new StringBuilder(GeonamesPropertyEnum.idx_ADM.toString()).append(i).toString();
                // add each level
                doc.add(property, parentCode.toString());
            }
            // [14] population
            String populationString = t.next();
            if (populationString != null) {
                // NOTE: we need to use Long, because of Asia (3.800.000)
                // (Long.valueOf instead of the deprecated new Long(String))
                Long population = Long.valueOf(populationString);
                if (population.intValue() > 0) {
                    doc.add(GeonamesPropertyEnum.gn_population.toString(), population);
                }
            }
            // [15 TO 16] elevation and gtopo30
            String altString = t.next();
            if (altString == null) {
                // if no elevation than use the gtopo30
                altString = t.next();
            } else {
                // if there is already en elevation, than consume these entry
                t.next();
            }
            // guard against entries where both elevation and gtopo30 are
            // missing (Integer.valueOf(null) would throw)
            if (altString != null) {
                Integer alt = Integer.valueOf(altString);
                if (alt.intValue() > -9999) {
                    // it looks like that -9999 is sometimes used as not known!
                    doc.add(GeonamesPropertyEnum.geo_alt.toString(), alt);
                }
            }
            // [17] time zone
            // not used
            t.next();
            // [18] mod-date
            String modDateString = t.next();
            if (modDateString != null) {
                try {
                    doc.add(GeonamesPropertyEnum.dc_date.toString(), TimeUtils.toDate(DataTypeEnum.DateTime, modDateString));
                } catch (IllegalArgumentException e) {
                    log.warn(String.format("Unable to parse modificationDate for geonamesID %s from value %s", doc.getId(), modDateString));
                }
            }
            return doc;
        }
    };
}
Example usage of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in the Apache Stanbol project, taken from the class EntityDataBasedIndexingDaemon, method run.
/**
 * Iterates over all Entities provided by the data iterable, computes the
 * (normalised) score for each Entity and produces every Entity that either
 * has a non-negative score, no score at all, or when all Entities are
 * indexed regardless of their score.
 */
@Override
public void run() {
    log.info("...start iterating over Entity data");
    final EntityDataIterator dataIterator = dataIterable.entityDataIterator();
    while (dataIterator.hasNext()) {
        final Long start = Long.valueOf(System.currentTimeMillis());
        final String id = dataIterator.next();
        Representation rep = null;
        Float score;
        if (scoreProvider.needsData()) {
            // the score provider needs the full Representation
            rep = dataIterator.getRepresentation();
            score = scoreProvider.process(rep);
        } else {
            // the id alone is sufficient for scoring
            score = scoreProvider.process(id);
        }
        // normalise the score
        if (normaliser != null) {
            score = normaliser.normalise(score);
        }
        // index the Entity if all entities are indexed anyway, no score is
        // available, or the score is >= 0
        final boolean index = indexAllEntitiesState
                || score == null
                || score.compareTo(ScoreNormaliser.ZERO) >= 0;
        if (index) {
            if (rep == null) {
                // Representation was not yet fetched for scoring
                rep = dataIterator.getRepresentation();
            }
            produce(rep, score, start);
        }
        // else ignore this entity
    }
    setFinished();
}
Aggregations