Search in sources :

Example 1 with IndexingConfig

use of org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig in project stanbol by apache.

the class RdfIndexingSourceTest method testEntityDataProvider.

/**
 * Iterates over all entity ids of the "provider" test configuration and
 * validates the representation the configured {@link EntityDataProvider}
 * returns for each of them. Finally checks that exactly
 * {@code NUMBER_OF_ENTITIES_EXPECTED} entities were processed.
 */
@Test
public void testEntityDataProvider() {
    log.info(" --- testEntityDataProvider ---");
    String testName = "provider";
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    assertNotNull("Unable to perform test whithout EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    // Checked before entering the try block: with a null provider the
    // close() call in the finally block would throw a NullPointerException
    // that hides this assertion failure.
    assertNotNull(dataProvider);
    try {
        if (dataProvider.needsInitialisation()) {
            dataProvider.initialise();
        }
        // JUnit expects (expected, actual); the order matters for the failure message
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        // check if all entities where found
        assertEquals(String.format("%s Entities expected but %s processed!", NUMBER_OF_ENTITIES_EXPECTED, count), NUMBER_OF_ENTITIES_EXPECTED, count);
    } finally {
        // we need to ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Also used : EntityDataProvider(org.apache.stanbol.entityhub.indexing.core.EntityDataProvider) EntityScore(org.apache.stanbol.entityhub.indexing.core.EntityIterator.EntityScore) IndexingConfig(org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig) EntityIterator(org.apache.stanbol.entityhub.indexing.core.EntityIterator) Test(org.junit.Test)

Example 2 with IndexingConfig

use of org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig in project stanbol by apache.

the class VcardIndexingSource method setConfiguration.

/**
 * Configures this vCard indexing source.
 * <p>
 * The method (1) (re-)creates the "vcard" import folder within the
 * destination folder of the {@link IndexingConfig}, (2) registers the
 * configured source files/folders with the {@link ResourceLoader},
 * (3) parses the charset used to read the vCard data and (4) parses the
 * required prefix and derives the type separator char from it.
 *
 * @param config the configuration. Read keys are
 *        {@code IndexingConfig.KEY_INDEXING_CONFIG},
 *        {@code PARAM_SOURCE_FILE_OR_FOLDER}, {@code PARAM_CHARSET} and
 *        {@code PARAM_PREFIX}.
 * @throws IllegalStateException if the import folder can not be deleted or
 *         created, if the configured charset is not supported or if the
 *         required prefix is missing or empty
 */
@Override
public void setConfiguration(Map<String, Object> config) {
    // init fields
    IndexingConfig indexingConfig = (IndexingConfig) config.get(IndexingConfig.KEY_INDEXING_CONFIG);
    loader = new ResourceLoader(this, true, false);
    // vcard files are imported from a special folder in the destination dir.
    // this folder needs to be deleted/(re-)created first.
    vcardFileImportFolder = new File(indexingConfig.getDestinationFolder(), "vcard");
    if (vcardFileImportFolder.exists()) {
        if (vcardFileImportFolder.isDirectory()) {
            try {
                FileUtils.deleteDirectory(vcardFileImportFolder);
            } catch (IOException e) {
                throw new IllegalStateException("Unable to delete Folder " + vcardFileImportFolder.getAbsolutePath() + " containing the vCard files from a " + "previouse indexing! Please remove this folder manually.", e);
            }
        } else if (!vcardFileImportFolder.delete()) {
            throw new IllegalStateException("Unable to delete File " + vcardFileImportFolder.getAbsolutePath() + " containing the vCard data from a " + "previouse indexing! Please remove this File manually.");
        }
    }
    if (!vcardFileImportFolder.mkdirs()) {
        // at this point the folder does not exist, so this is a creation (not a deletion) failure
        throw new IllegalStateException("Unable to create Folder " + vcardFileImportFolder.getAbsolutePath() + " used to import the vCard files! Please create this folder manually.");
    }
    // load config
    Object value;
    log.debug("load vcard resources from :");
    value = config.get(PARAM_SOURCE_FILE_OR_FOLDER);
    if (value == null) {
        // if not set use the default
        value = DEFAULT_SOURCE_FOLDER_NAME;
    }
    for (String source : value.toString().split(",")) {
        File sourceFileOrDirectory = indexingConfig.getSourceFile(source);
        if (sourceFileOrDirectory.exists()) {
            // register the configured source with the ResourceLoader
            this.loader.addResource(sourceFileOrDirectory);
        } else if (FilenameUtils.getExtension(source).isEmpty()) {
            // no file extension -> treat the missing source as a directory
            // and try to create it
            if (sourceFileOrDirectory.mkdirs()) {
                // this would not be necessary because the directory will
                // be empty - however I like to be consistent and have
                // all configured and existent files & dirs added the the
                // resource loader
                this.loader.addResource(sourceFileOrDirectory);
            } else {
                log.warn("Unable to create directory {} configured to improt source data from. " + "You will need to create this directory manually before copying the " + "Source files into it.", sourceFileOrDirectory);
            }
        } else {
            log.warn("Unable to find vcard source {} within the indexing Source folder {}", source, indexingConfig.getSourceFolder());
        }
    }
    if (log.isDebugEnabled()) {
        for (String registeredSource : loader.getResources(ResourceState.REGISTERED)) {
            log.debug(" > {}", registeredSource);
        }
    }
    // parse the encoding
    value = config.get(PARAM_CHARSET);
    if (value != null) {
        String encoding = value.toString();
        if (encoding.isEmpty()) {
            // use plattform encoding if empty
            charset = Charset.defaultCharset();
        } else {
            try {
                charset = Charset.forName(encoding);
            } catch (RuntimeException e) {
                throw new IllegalStateException("The configured encoding '" + encoding + "' is not supported by this Plattform", e);
            }
        }
    } else {
        // use plattorm encoding if missing
        charset = Charset.defaultCharset();
    }
    // parse the prefix
    value = config.get(PARAM_PREFIX);
    if (value == null || value.toString().isEmpty()) {
        throw new IllegalStateException("Teh configuration is missing the required parameter 'prefix'!");
    } else {
        prefix = value.toString();
        // set the typeSeperatorChar based on the kind of parsed prefix
        if (prefix.endsWith("#")) {
            typeSeperatorChar = '.';
        } else if (prefix.endsWith("/")) {
            typeSeperatorChar = '/';
        } else if (prefix.endsWith(":")) {
            typeSeperatorChar = ':';
        } else if (prefix.startsWith("urn:")) {
            // maybe an urn without an tailing ':'
            prefix = prefix + ':';
            typeSeperatorChar = ':';
        } else if (prefix.indexOf("://") > 0) {
            // maybe an url without an tailing '/' or '#'
            prefix = prefix + '/';
        }
    // else ... no idea what kind of prefix ... use the default '/'
    }
}
Also used : ResourceLoader(org.apache.stanbol.entityhub.indexing.core.source.ResourceLoader) IndexingConfig(org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig) IOException(java.io.IOException) File(java.io.File)

Example 3 with IndexingConfig

use of org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig in project stanbol by apache.

the class RdfIndexingSource method setConfiguration.

/**
 * Configures this RDF indexing source.
 * <p>
 * The method (1) obtains the TDB dataset via {@code Utils.getTDBDataset},
 * (2) instantiates the configured {@link RdfImportFilter}s (combined by an
 * {@code UnionImportFilter} if more than one is configured), (3) creates
 * the {@link ResourceLoader} and sets its imported folder, (4) registers
 * the configured RDF sources unless importing is deactivated and
 * (5) parses the blank node state and prefix.
 *
 * @param config the configuration. Read keys are
 *        {@code IndexingConfig.KEY_INDEXING_CONFIG},
 *        {@code PARAM_IMPORT_FILTER}, {@code PARAM_IMPORTED_FOLDER},
 *        {@code PARAM_IMPORT_SOURCE}, {@code PARAM_SOURCE_FILE_OR_FOLDER},
 *        {@code PARAM_BNODE_STATE} and {@code PARAM_BNODE_PREFIX}.
 * @throws IllegalArgumentException if a configured import filter can not be
 *         loaded or instantiated or if the configured bnode prefix is not a
 *         valid URI
 */
@Override
public void setConfiguration(Map<String, Object> config) {
    IndexingConfig indexingConfig = (IndexingConfig) config.get(IndexingConfig.KEY_INDEXING_CONFIG);
    // first init the RDF Model
    this.indexingDataset = Utils.getTDBDataset(config);
    // second we need to check if we need to import RDF files to the RDF model
    // look if we need want to use an import filter
    Object value = config.get(PARAM_IMPORT_FILTER);
    if (value == null) {
        log.info("No RDF Import Filter configured");
        importFilter = null;
    } else {
        String[] filterNames = value.toString().split(",");
        List<RdfImportFilter> filters = new ArrayList<RdfImportFilter>();
        ClassLoader cl = indexingConfig.getClass().getClassLoader();
        for (String filterName : filterNames) {
            filterName = filterName.trim();
            try {
                Class<? extends RdfImportFilter> importFilterClass = cl.loadClass(filterName).asSubclass(RdfImportFilter.class);
                RdfImportFilter filter = importFilterClass.newInstance();
                filter.setConfiguration(config);
                filters.add(filter);
                // log the filter just created; the importFilter field is
                // only assigned after all filters are instantiated
                log.info("Use RDF ImportFilter {} (type: {})", filter, importFilterClass.getSimpleName());
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Configured RdfImportFilter '" + filterName + "' not found", e);
            } catch (InstantiationException e) {
                throw new IllegalArgumentException("Configured RdfImportFilter '" + filterName + "' can not be instantiated", e);
            } catch (IllegalAccessException e) {
                throw new IllegalArgumentException("Configured RdfImportFilter '" + filterName + "' can not be created", e);
            }
        }
        if (filters.isEmpty()) {
            this.importFilter = null;
        } else if (filters.size() == 1) {
            this.importFilter = filters.get(0);
        } else {
            this.importFilter = new UnionImportFilter(filters.toArray(new RdfImportFilter[filters.size()]));
        }
    }
    boolean failOnError = indexingConfig.isFailOnError();
    // create the ResourceLoader
    this.loader = new ResourceLoader(new RdfResourceImporter(indexingDataset, importFilter), failOnError);
    value = config.get(PARAM_IMPORTED_FOLDER);
    String importedFolderName;
    if (value != null && !value.toString().isEmpty()) {
        importedFolderName = value.toString();
    } else {
        importedFolderName = DEFAULT_IMPORTED_FOLDER_NAME;
    }
    File importedFolder = new File(indexingConfig.getSourceFolder(), importedFolderName);
    log.info("Imported RDF File Folder: {}", importedFolder);
    this.loader.setImportedDir(importedFolder);
    // check if importing is deactivated
    // default is true
    boolean importSource = true;
    value = config.get(PARAM_IMPORT_SOURCE);
    if (value != null) {
        importSource = Boolean.parseBoolean(value.toString());
    }
    if (importSource) {
        // if we need to import ... check the source config
        log.info("Importing RDF data from:");
        value = config.get(PARAM_SOURCE_FILE_OR_FOLDER);
        if (value == null) {
            // if not set use the default
            value = DEFAULT_SOURCE_FOLDER_NAME;
        }
        for (String source : value.toString().split(",")) {
            File sourceFileOrDirectory = indexingConfig.getSourceFile(source);
            if (sourceFileOrDirectory.exists()) {
                // register the configured source with the ResourceLoader
                this.loader.addResource(sourceFileOrDirectory);
            } else if (FilenameUtils.getExtension(source).isEmpty()) {
                // no file extension -> treat the missing source as a
                // directory and try to create it
                if (sourceFileOrDirectory.mkdirs()) {
                    // this would not be necessary because the directory will
                    // be empty - however I like to be consistent and have
                    // all configured and existent files & dirs added the the
                    // resource loader
                    this.loader.addResource(sourceFileOrDirectory);
                } else {
                    log.warn("Unable to create directory {} configured to improt RDF data from. " + "You will need to create this directory manually before copying the " + "RDF files into it.", sourceFileOrDirectory);
                }
            } else {
                log.warn("Unable to find RDF source {} within the indexing Source folder {}", source, indexingConfig.getSourceFolder());
            }
        }
        if (log.isInfoEnabled()) {
            for (String registeredSource : loader.getResources(ResourceState.REGISTERED)) {
                log.info(" > {}", registeredSource);
            }
        }
    } else {
        // the parameter name is a log argument, not part of the format string
        log.info("Importing RDF data deactivated by parameer {}={}", PARAM_IMPORT_SOURCE, value);
    }
    // STANBOL-765: parsed bnode-prefix from parsed configuration.
    value = config.get(PARAM_BNODE_STATE);
    final Boolean bnodeState;
    if (value != null) {
        bnodeState = value instanceof Boolean ? (Boolean) value : Boolean.parseBoolean(value.toString());
    } else if (config.containsKey(PARAM_BNODE_STATE)) {
        // support key without value
        bnodeState = true;
    } else {
        // undefined
        bnodeState = null;
    }
    if (bnodeState == null || bnodeState) {
        // null or enabled -> consider prefix
        value = config.get(PARAM_BNODE_PREFIX);
        if (value != null) {
            try {
                // only used to validate the syntax of the configured prefix
                new URI(value.toString());
            } catch (URISyntaxException e) {
                // keep the cause so the exact syntax problem is reported
                throw new IllegalArgumentException("The configured " + PARAM_BNODE_PREFIX + "='" + value.toString() + "' MUST BE a valid URI!", e);
            }
            bnodePrefix = value.toString();
        } else if (bnodeState != null) {
            // use default prefix if bnodeState is true
            bnodePrefix = String.format("urn:bnode:%s:", indexingConfig.getName());
        }
    // else bnodeState == null and no custom prefix -> disable by default
    }
    if (bnodePrefix != null) {
        log.info("Indexing of Bnodes enabled (prefix: {})", bnodePrefix);
    } else {
        log.info("Indexing of Bnodes disabled");
    }
}
Also used : ResourceLoader(org.apache.stanbol.entityhub.indexing.core.source.ResourceLoader) IndexingConfig(org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig) ArrayList(java.util.ArrayList) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) File(java.io.File)

Example 4 with IndexingConfig

use of org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig in project stanbol by apache.

the class RdfIndexingSourceTest method testBlankNodeSupport.

/**
 * Iterates over the entities provided by the {@link EntityDataIterable} of
 * the "bnode" test configuration, validates each representation and checks
 * that exactly three entities were processed.
 */
@Test
public void testBlankNodeSupport() {
    log.info(" --- testBlankNodeSupport ---");
    String testName = "bnode";
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityDataIterable iterable = config.getDataIterable();
    assertNotNull(iterable);
    try {
        // JUnit expects (expected, actual); the order matters for the failure message
        assertEquals(RdfIndexingSource.class, iterable.getClass());
        assertTrue(iterable.needsInitialisation());
        iterable.initialise();
        // ((RdfIndexingSource)iterable).debug();
        EntityDataIterator it = iterable.entityDataIterator();
        long count = 0;
        while (it.hasNext()) {
            String entity = it.next();
            log.info("validate Entity {}", entity);
            assertNotNull(entity);
            validateRepresentation(it.getRepresentation(), entity);
            count++;
        }
        // check if all entities where indexed
        // Expected are 3 entities First France from france.rdf
        // and two from BlankNode Entities in bnode.nt
        assertEquals(String.format("> %s Entities expected but only %s processed!", 3, count), 3, count);
    } finally {
        // we need to ensure close is called as otherwise other tests might fail
        // NOTE(review): assumes EntityDataIterable declares close() like
        // EntityDataProvider does - confirm against the interface
        iterable.close();
    }
}
Also used : IndexingConfig(org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig) EntityDataIterable(org.apache.stanbol.entityhub.indexing.core.EntityDataIterable) EntityDataIterator(org.apache.stanbol.entityhub.indexing.core.EntityDataIterator) Test(org.junit.Test)

Example 5 with IndexingConfig

use of org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig in project stanbol by apache.

the class RdfIndexingSourceTest method testQuadsImport.

/**
 * Tests support for Quads (STANBOL-764).
 * <p>
 * Iterates over all entity ids of the "quads" test configuration, validates
 * the representation returned by the {@link EntityDataProvider} for each of
 * them and checks that exactly nine entities were processed.
 */
@Test
public void testQuadsImport() {
    log.info(" --- testQuadsImport ---");
    String testName = "quads";
    IndexingConfig config = new IndexingConfig(CONFIG_ROOT + File.separatorChar + testName, CONFIG_ROOT + '/' + testName) {
    };
    EntityIterator entityIdIterator = config.getEntityIdIterator();
    assertNotNull("Unable to perform test whithout EntityIterator", entityIdIterator);
    if (entityIdIterator.needsInitialisation()) {
        entityIdIterator.initialise();
    }
    EntityDataProvider dataProvider = config.getEntityDataProvider();
    // checked before the try block so that the finally block never calls
    // close() on a null provider
    assertNotNull(dataProvider);
    try {
        // there are test data to load
        assertTrue(dataProvider.needsInitialisation());
        dataProvider.initialise();
        // JUnit expects (expected, actual); the order matters for the failure message
        assertEquals(RdfIndexingSource.class, dataProvider.getClass());
        long count = 0;
        while (entityIdIterator.hasNext()) {
            EntityScore entityScore = entityIdIterator.next();
            assertNotNull(entityScore);
            assertNotNull(entityScore.id);
            validateRepresentation(dataProvider.getEntityData(entityScore.id), entityScore.id);
            count++;
        }
        // check if all 9 entities where imported to the default dataset
        // (and not named graphs)
        assertEquals(String.format("%s Entities expected but %s processed!", 9, count), 9, count);
    } finally {
        // we need to ensure close is called as otherwise other tests might fail
        dataProvider.close();
    }
}
Also used : EntityDataProvider(org.apache.stanbol.entityhub.indexing.core.EntityDataProvider) EntityScore(org.apache.stanbol.entityhub.indexing.core.EntityIterator.EntityScore) IndexingConfig(org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig) EntityIterator(org.apache.stanbol.entityhub.indexing.core.EntityIterator) Test(org.junit.Test)

Aggregations

IndexingConfig (org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig)26 Test (org.junit.Test)15 File (java.io.File)8 EntityIterator (org.apache.stanbol.entityhub.indexing.core.EntityIterator)4 EntityScore (org.apache.stanbol.entityhub.indexing.core.EntityIterator.EntityScore)4 IOException (java.io.IOException)3 EntityDataIterable (org.apache.stanbol.entityhub.indexing.core.EntityDataIterable)3 EntityDataIterator (org.apache.stanbol.entityhub.indexing.core.EntityDataIterator)3 EntityDataProvider (org.apache.stanbol.entityhub.indexing.core.EntityDataProvider)3 ResourceLoader (org.apache.stanbol.entityhub.indexing.core.source.ResourceLoader)3 FileInputStream (java.io.FileInputStream)2 InputStream (java.io.InputStream)2 NamespacePrefixService (org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService)2 ScoreNormaliser (org.apache.stanbol.entityhub.indexing.core.normaliser.ScoreNormaliser)2 URI (java.net.URI)1 URISyntaxException (java.net.URISyntaxException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 IndexerImpl (org.apache.stanbol.entityhub.indexing.core.impl.IndexerImpl)1 EntityIneratorToScoreProviderAdapter (org.apache.stanbol.entityhub.indexing.core.source.EntityIneratorToScoreProviderAdapter)1