Examples with AttributeValueSets - org.carrot2.util.attribute.AttributeValueSets

Example 1 with AttributeValueSets

use of org.carrot2.util.attribute.AttributeValueSets in project lucene-solr by apache.

the class CarrotClusteringEngine method init.

@Override
@SuppressWarnings("rawtypes")
public String init(NamedList config, final SolrCore core) {
    this.core = core;
    String result = super.init(config, core);
    final SolrParams initParams = SolrParams.toSolrParams(config);
    // Initialization attributes for Carrot2 controller.
    HashMap<String, Object> initAttributes = new HashMap<>();
    // Customize Carrot2's resource lookup to first look for resources
    // using Solr's resource loader. If that fails, try loading from the classpath.
    ResourceLookup resourceLookup = new ResourceLookup(// Solr-specific resource loading.
    new SolrResourceLocator(core, initParams), // Using the class loader directly because this time we want to omit the prefix
    new ClassLoaderLocator(core.getResourceLoader().getClassLoader()));
    DefaultLexicalDataFactoryDescriptor.attributeBuilder(initAttributes).resourceLookup(resourceLookup);
    // Make sure the requested Carrot2 clustering algorithm class is available
    String carrotAlgorithmClassName = initParams.get(CarrotParams.ALGORITHM);
    try {
        this.clusteringAlgorithmClass = core.getResourceLoader().findClass(carrotAlgorithmClassName, IClusteringAlgorithm.class);
    } catch (SolrException s) {
        if (!(s.getCause() instanceof ClassNotFoundException)) {
            throw s;
        }
    }
    // Load Carrot2-Workbench exported attribute XMLs based on the 'name' attribute
    // of this component. This by-name convention lookup is used to simplify configuring algorithms.
    String componentName = initParams.get(ClusteringEngine.ENGINE_NAME);
    log.info("Initializing Clustering Engine '" + MoreObjects.firstNonNull(componentName, "<no 'name' attribute>") + "'");
    if (!Strings.isNullOrEmpty(componentName)) {
        IResource[] attributeXmls = resourceLookup.getAll(componentName + "-attributes.xml");
        if (attributeXmls.length > 0) {
            if (attributeXmls.length > 1) {
                log.warn("More than one attribute file found, first one will be used: " + Arrays.toString(attributeXmls));
            }
            Thread ct = Thread.currentThread();
            ClassLoader prev = ct.getContextClassLoader();
            try {
                ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
                AttributeValueSets avs = AttributeValueSets.deserialize(attributeXmls[0].open());
                AttributeValueSet defaultSet = avs.getDefaultAttributeValueSet();
                initAttributes.putAll(defaultSet.getAttributeValues());
            } catch (Exception e) {
                throw new SolrException(ErrorCode.SERVER_ERROR, "Could not read attributes XML for clustering component: " + componentName, e);
            } finally {
                ct.setContextClassLoader(prev);
            }
        }
    }
    // Extract solrconfig attributes, they take precedence.
    extractCarrotAttributes(initParams, initAttributes);
    // Customize the stemmer and tokenizer factories. The implementations we provide here
    // are included in the code base of Solr, so that it's possible to refactor
    // the Lucene APIs the factories rely on if needed.
    // Additionally, we set a custom lexical resource factory for Carrot2 that
    // will use both Carrot2 default stop words as well as stop words from
    // the StopFilter defined on the field.
    final AttributeBuilder attributeBuilder = BasicPreprocessingPipelineDescriptor.attributeBuilder(initAttributes);
    attributeBuilder.lexicalDataFactory(SolrStopwordsCarrot2LexicalDataFactory.class);
    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.TOKENIZER_FACTORY)) {
        attributeBuilder.tokenizerFactory(LuceneCarrot2TokenizerFactory.class);
    }
    if (!initAttributes.containsKey(BasicPreprocessingPipelineDescriptor.Keys.STEMMER_FACTORY)) {
        attributeBuilder.stemmerFactory(LuceneCarrot2StemmerFactory.class);
    }
    // Pass the schema (via the core) to SolrStopwordsCarrot2LexicalDataFactory.
    initAttributes.put("solrCore", core);
    // Carrot2 uses current thread's context class loader to get
    // certain classes (e.g. custom tokenizer/stemmer) at initialization time.
    // To make sure classes from contrib JARs are available,
    // we swap the context class loader for the time of clustering.
    Thread ct = Thread.currentThread();
    ClassLoader prev = ct.getContextClassLoader();
    try {
        ct.setContextClassLoader(core.getResourceLoader().getClassLoader());
        this.controller.init(initAttributes);
    } finally {
        ct.setContextClassLoader(prev);
    }
    SchemaField uniqueField = core.getLatestSchema().getUniqueKeyField();
    if (uniqueField == null) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, CarrotClusteringEngine.class.getSimpleName() + " requires the schema to have a uniqueKeyField");
    }
    this.idFieldName = uniqueField.getName();
    return result;
}

Also used : AttributeBuilder(org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder) HashMap(java.util.HashMap) AttributeValueSets(org.carrot2.util.attribute.AttributeValueSets) SolrException(org.apache.solr.common.SolrException) IOException(java.io.IOException) ResourceLookup(org.carrot2.util.resource.ResourceLookup) AttributeValueSet(org.carrot2.util.attribute.AttributeValueSet) SchemaField(org.apache.solr.schema.SchemaField) ClassLoaderLocator(org.carrot2.util.resource.ClassLoaderLocator) IClusteringAlgorithm(org.carrot2.core.IClusteringAlgorithm) SolrParams(org.apache.solr.common.params.SolrParams) SolrException(org.apache.solr.common.SolrException) IResource(org.carrot2.util.resource.IResource)

Aggregations

IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 SolrException (org.apache.solr.common.SolrException)1 SolrParams (org.apache.solr.common.params.SolrParams)1 SchemaField (org.apache.solr.schema.SchemaField)1 IClusteringAlgorithm (org.carrot2.core.IClusteringAlgorithm)1 AttributeBuilder (org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipelineDescriptor.AttributeBuilder)1 AttributeValueSet (org.carrot2.util.attribute.AttributeValueSet)1 AttributeValueSets (org.carrot2.util.attribute.AttributeValueSets)1 ClassLoaderLocator (org.carrot2.util.resource.ClassLoaderLocator)1 IResource (org.carrot2.util.resource.IResource)1 ResourceLookup (org.carrot2.util.resource.ResourceLookup)1