Search in sources :

Example 1 with EntityDataIterator

use of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in project stanbol by apache.

the class RdfIndexingSourceTest method testBlankNodeSupport.

/**
 * Iterates over all entities provided by the {@link RdfIndexingSource}
 * configured under the "bnode" test configuration, validates each returned
 * representation and asserts that exactly three entities were processed.
 */
@Test
public void testBlankNodeSupport() {
    log.info(" --- testBlankNodeSupport ---");
    String testName = "bnode";
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityDataIterable iterable = config.getDataIterable();
    assertNotNull(iterable);
    // JUnit expects (expected, actual); the reversed order would produce a
    // misleading failure message
    assertEquals(RdfIndexingSource.class, iterable.getClass());
    assertTrue(iterable.needsInitialisation());
    iterable.initialise();
    // ((RdfIndexingSource)iterable).debug();
    EntityDataIterator it = iterable.entityDataIterator();
    long count = 0;
    while (it.hasNext()) {
        String entity = it.next();
        log.info("validate Entity " + entity);
        assertNotNull(entity);
        // the representation belongs to the entity id returned by the last next() call
        validateRepresentation(it.getRepresentation(), entity);
        count++;
    }
    // check if all entities were indexed
    // Expected are 3 entities: first France from france.rdf
    // and two from BlankNode entities in bnode.nt
    assertEquals(String.format("> %s Entities expected but only %s processed!", 3, count), 3, count);
}
Also used : IndexingConfig(org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig) EntityDataIterable(org.apache.stanbol.entityhub.indexing.core.EntityDataIterable) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator) Test(org.junit.Test)

Example 2 with EntityDataIterator

use of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in project stanbol by apache.

the class RdfIndexingSourceTest method testEntityDataIterable.

/**
 * Iterates over all entities provided by the {@link RdfIndexingSource}
 * configured under the "iterable" test configuration, validates each
 * returned representation and asserts that at least
 * {@code NUMBER_OF_ENTITIES_EXPECTED} entities were processed.
 */
@Test
public void testEntityDataIterable() {
    log.info(" --- testEntityDataIterable ---");
    String testName = "iterable";
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityDataIterable iterable = config.getDataIterable();
    assertNotNull(iterable);
    // JUnit expects (expected, actual); the reversed order would produce a
    // misleading failure message
    assertEquals(RdfIndexingSource.class, iterable.getClass());
    assertTrue(iterable.needsInitialisation());
    iterable.initialise();
    EntityDataIterator it = iterable.entityDataIterator();
    long count = 0;
    while (it.hasNext()) {
        String entity = it.next();
        log.info("validate Entity " + entity);
        assertNotNull(entity);
        // the representation belongs to the entity id returned by the last next() call
        validateRepresentation(it.getRepresentation(), entity);
        count++;
    }
    // check if all entities were indexed:
    // at least NUMBER_OF_ENTITIES_EXPECTED entities must have been
    // processed (more are accepted)
    assertTrue(String.format("> %s Entities expected but only %s processed!", NUMBER_OF_ENTITIES_EXPECTED, count), NUMBER_OF_ENTITIES_EXPECTED <= count);
}
Also used : IndexingConfig(org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig) EntityDataIterable(org.apache.stanbol.entityhub.indexing.core.EntityDataIterable) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator) Test(org.junit.Test)

Example 3 with EntityDataIterator

use of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in project stanbol by apache.

the class RdfIndexingSource method close.

/**
 * Releases the resources held by this indexing source: closes every
 * iterator registered in {@code entityDataIterators}, releases the LDPath
 * and entity-data-provider connections and, when {@code shutdownRepository}
 * is set, shuts down the repository.
 */
@Override
public void close() {
    // iterators that are still registered need to be closed first
    for (EntityDataIterator activeIterator : entityDataIterators) {
        activeIterator.close();
    }
    // release the connections used for LDPath and the EntityDataProvider
    ungetLdPathConnection();
    ungetEntityDataProviderConnection();
    if (!shutdownRepository) {
        // shutting down the repository was not requested
        return;
    }
    // the repository shutdown is the very last step
    try {
        repository.shutDown();
    } catch (RepositoryException e) {
        // a failed shutdown is only reported, never propagated
        log.warn("Error while closing Sesame Connection", e);
    }
}
Also used : RepositoryException(org.openrdf.repository.RepositoryException) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator)

Example 4 with EntityDataIterator

use of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in project stanbol by apache.

the class GeonamesIndexingSource method entityDataIterator.

/**
 * Creates the (single) iterator over all geonames entries of the resources
 * in {@code resourceList}. Every line of a resource is parsed into a
 * {@link Representation}; {@code next()} returns the id of that
 * representation and {@code getRepresentation()} the representation itself.
 *
 * @return the iterator over the entity ids and their representations
 * @throws IllegalStateException if this method was already called before,
 * because the data can only be iterated once
 */
@Override
public EntityDataIterator entityDataIterator() {
    if (consumed) {
        throw new IllegalStateException("This implementation supports only a single Iteration of the data.");
    }
    consumed = true;
    return new EntityDataIterator() {

        /** the resources still to be read; set to null by close() */
        Iterator<RDFTerm> resources = resourceList.iterator();

        /** the resource currently read */
        RDFTerm r;

        /** iterator over the lines of the current resource */
        LineIterator it = null;

        /** line read ahead by hasNext() but not yet consumed by next() */
        private String next;

        /** representation parsed by the last call to next() */
        private Representation rep;

        /**
         * Reads the next line, switching to the next resource when the
         * current one is exhausted.
         * @return the next line or <code>null</code> if there are no more lines
         */
        private String getNext() {
            while ((it == null || !it.hasNext()) && resources != null && resources.hasNext()) {
                if (r != null) {
                    // the previous resource is completely read
                    IOUtils.closeQuietly(r.is);
                }
                r = resources.next();
                try {
                    it = r.getEntries();
                } catch (IOException e) {
                    // skip resources that can not be read; the exception
                    // (including the stack trace) is reported by the logger
                    log.error("Unable to read RDFTerm '" + r.getName() + "' because of " + e.getMessage(), e);
                    IOUtils.closeQuietly(r.is);
                    it = null;
                }
                resources.remove();
            }
            if (it != null && it.hasNext()) {
                return it.nextLine();
            } else {
                return null;
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        @Override
        public String next() {
            if (next == null) {
                next = getNext();
            }
            if (next == null) {
                throw new NoSuchElementException();
            } else {
                rep = processGeonameEntry(next);
                next = null;
                return rep.getId();
            }
        }

        @Override
        public boolean hasNext() {
            if (next == null) {
                next = getNext();
            }
            return next != null;
        }

        @Override
        public Representation getRepresentation() {
            return rep;
        }

        @Override
        public void close() {
            if (r != null) {
                IOUtils.closeQuietly(r.is);
            }
            next = null;
            it = null;
            resources = null;
        }

        /**
         * Parses the Representation from the current line.<p>
         * NOTE: this does not process alternate labels and also does not
         * lookup entities for parent codes. Those things are done now by
         * own EntityProcessors.<p>
         * The null checks in this method assume that the tokenizer
         * returns <code>null</code> for empty columns (TODO confirm
         * against LineTokenizer).
         * @param line the line to process
         * @return the representation
         */
        private Representation processGeonameEntry(String line) {
            LineTokenizer t = new LineTokenizer(line);
            // [0] geonames id
            String id = t.next();
            Integer geoNamesId = Integer.parseInt(id);
            // create a new Doc based on the first Element (geonamesID)
            Representation doc = valueFactory.createRepresentation(new StringBuilder(GEONAMES_RESOURCE_NS).append(id).append('/').toString());
            // add the Integer id so that we do not need to parse it from the subject URI
            doc.add(GeonamesPropertyEnum.idx_id.toString(), geoNamesId);
            // add the geonames:Feature type
            doc.add(GeonamesPropertyEnum.rdf_type.toString(), getReference(GeonamesPropertyEnum.gn_Feature.toString()));
            // [1] UTF-8 name
            String utf8Label = t.next();
            // [2] ASCII Name as rdfs:label
            String askiiLabel = t.next();
            if (utf8Label == null) {
                // use ASCII label as fallback for the utf8 version
                utf8Label = askiiLabel;
            }
            doc.addNaturalText(GeonamesPropertyEnum.gn_name.toString(), utf8Label);
            // [3] Alternate Names
            // alternate names are added later during processing
            t.next();
            // [4] lat
            doc.add(GeonamesPropertyEnum.geo_lat.toString(), new BigDecimal(t.next()));
            // [5] lon
            doc.add(GeonamesPropertyEnum.geo_long.toString(), new BigDecimal(t.next()));
            // [6] featureClass
            String featureClass = new StringBuilder(GEONAMES_ONTOLOGY_NS).append(t.next()).toString();
            doc.add(GeonamesPropertyEnum.gn_featureClass.toString(), getReference(featureClass));
            // [7] featureCode (-> need to use <featureClass>.<featureCode>!!)
            doc.add(GeonamesPropertyEnum.gn_featureCode.toString(), getReference(new StringBuilder(featureClass).append('.').append(t.next()).toString()));
            // countryCode
            // -> geonames uses here the link to an HTML Page showing the Country
            // We would like to use an Link to a SKOS:Concept representing the Country
            // ... But luckily here we need only to add the URI!
            Set<String> ccs = new HashSet<String>();
            // [8] countryCode
            String countryCode = t.next();
            if (countryCode != null) {
                // need to trim because some country codes use '  ' to indicate null!
                countryCode = countryCode.trim();
                if (countryCode.length() == 2) {
                    // Yes there are some features that are in no country!
                    ccs.add(countryCode);
                }
            }
            // [9] alternate countryCodes
            String altCc = t.next();
            if (altCc != null) {
                StringTokenizer altCcT = new StringTokenizer(altCc, ",");
                while (altCcT.hasMoreElements()) {
                    // use an own variable: the primary country code is
                    // still needed below as root of the admin hierarchy
                    String altCountryCode = altCcT.nextToken();
                    if (altCountryCode.length() == 2) {
                        ccs.add(altCountryCode);
                    }
                }
            }
            if (!ccs.isEmpty()) {
                doc.add(GeonamesPropertyEnum.gn_countryCode.toString(), ccs);
            }
            // [10 TO 13] Admin codes
            // first read them -> we need to consume the tokens anyway
            String[] adminCodes = new String[] {
                // country
                countryCode,
                // ADM1
                t.next(),
                // ADM2
                t.next(),
                // ADM3
                t.next(),
                // ADM4
                t.next() };
            // Workaround for Admin1 -> add leading '0' for single Value
            if (adminCodes[1] != null && adminCodes[1].length() < 2) {
                adminCodes[1] = '0' + adminCodes[1];
            }
            // now process the admin Codes (including the country at index 0)
            StringBuilder parentCode = new StringBuilder();
            // iterate over parent codes until the first NULL (or '00' unknown) element
            for (int i = 0; i < adminCodes.length && adminCodes[i] != null && !adminCodes[i].equals("00"); i++) {
                if (i > 0) {
                    parentCode.append('.');
                }
                // add the current (last) Element
                parentCode.append(adminCodes[i]);
                String property = i == 0 ? GeonamesPropertyEnum.idx_CC.toString() : new StringBuilder(GeonamesPropertyEnum.idx_ADM.toString()).append(i).toString();
                // add each level
                doc.add(property, parentCode.toString());
            }
            // [14] population
            String populationString = t.next();
            if (populationString != null) {
                // NOTE: we need to use Long, because the population of Asia
                // does not fit into an int. For the same reason the value
                // must be compared as long: intValue() would overflow
                Long population = Long.valueOf(populationString);
                if (population.longValue() > 0) {
                    doc.add(GeonamesPropertyEnum.gn_population.toString(), population);
                }
            }
            // [15 TO 16] elevation and gtopo30
            String altString = t.next();
            if (altString == null) {
                // if no elevation than use the gtopo30
                altString = t.next();
            } else {
                // if there is already an elevation, than consume this entry
                t.next();
            }
            // both values may be missing -> do not try to parse null
            if (altString != null) {
                Integer alt = Integer.valueOf(altString);
                if (alt.intValue() > -9999) {
                    // it looks like that -9999 is sometimes used as not known!
                    doc.add(GeonamesPropertyEnum.geo_alt.toString(), alt);
                }
            }
            // [17] time zone
            // not used
            t.next();
            // [18] mod-date
            String modDateString = t.next();
            if (modDateString != null) {
                try {
                    doc.add(GeonamesPropertyEnum.dc_date.toString(), TimeUtils.toDate(DataTypeEnum.DateTime, modDateString));
                } catch (IllegalArgumentException e) {
                    log.warn(String.format("Unable to parse modificationDate for geonamesID %s from value %s", doc.getId(), modDateString));
                }
            }
            return doc;
        }
    };
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) IOException(java.io.IOException) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator) LineIterator(org.apache.commons.io.LineIterator) BigDecimal(java.math.BigDecimal) StringTokenizer(java.util.StringTokenizer) LineIterator(org.apache.commons.io.LineIterator) Iterator(java.util.Iterator) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator) NoSuchElementException(java.util.NoSuchElementException) HashSet(java.util.HashSet)

Example 5 with EntityDataIterator

use of org.apache.stanbol.entityhub.indexing.core.EntityDataIterator in project stanbol by apache.

the class EntityDataBasedIndexingDaemon method run.

/**
 * Iterates over all entities of {@code dataIterable}, determines the score
 * of each entity by using the {@code scoreProvider} (and the optional
 * {@code normaliser}) and hands entities that are not filtered by their
 * score over to {@code produce(..)}. After the last entity is processed
 * {@code setFinished()} is called.
 */
@Override
public void run() {
    log.info("...start iterating over Entity data");
    EntityDataIterator entities = dataIterable.entityDataIterator();
    while (entities.hasNext()) {
        // taken before the entity is read and later passed on to produce(..)
        Long started = Long.valueOf(System.currentTimeMillis());
        String entityId = entities.next();
        Representation representation = null;
        Float entityScore;
        if (scoreProvider.needsData()) {
            // the score is calculated based on the data of the entity
            representation = entities.getRepresentation();
            entityScore = scoreProvider.process(representation);
        } else {
            // the id is sufficient to look up the score
            entityScore = scoreProvider.process(entityId);
        }
        // normalise the score
        if (normaliser != null) {
            entityScore = normaliser.normalise(entityScore);
        }
        // ignore entities with a known score < 0 unless all entities
        // need to be indexed anyway
        boolean ignored = !indexAllEntitiesState && entityScore != null && entityScore.compareTo(ScoreNormaliser.ZERO) < 0;
        if (ignored) {
            continue;
        }
        if (representation == null) {
            // the data were not needed for scoring -> load them now
            representation = entities.getRepresentation();
        }
        produce(representation, entityScore, started);
    }
    setFinished();
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator)

Aggregations

EntityDataIterator (org.apache.stanbol.entityhub.indexing.core.EntityDataIterator)6 EntityDataIterable (org.apache.stanbol.entityhub.indexing.core.EntityDataIterable)3 IndexingConfig (org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig)3 Test (org.junit.Test)3 Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation)2 IOException (java.io.IOException)1 BigDecimal (java.math.BigDecimal)1 HashSet (java.util.HashSet)1 Iterator (java.util.Iterator)1 NoSuchElementException (java.util.NoSuchElementException)1 StringTokenizer (java.util.StringTokenizer)1 LineIterator (org.apache.commons.io.LineIterator)1 RepositoryException (org.openrdf.repository.RepositoryException)1