Use of org.apache.lucene.analysis.util.TokenFilterFactory in project lucene-solr by apache.
From the class PayloadUtils, the method getPayloadEncoder:
public static String getPayloadEncoder(FieldType fieldType) {
    // TODO: support custom payload encoding fields too somehow - maybe someone has a custom component that encodes payloads as floats
    String encoder = null;
    Analyzer a = fieldType.getIndexAnalyzer();
    if (a instanceof TokenizerChain) {
        // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory or NumericPayloadTokenFilterFactory
        TokenizerChain tc = (TokenizerChain) a;
        TokenFilterFactory[] factories = tc.getTokenFilterFactories();
        for (TokenFilterFactory factory : factories) {
            if (factory instanceof DelimitedPayloadTokenFilterFactory) {
                encoder = factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR);
                break;
            }
            if (factory instanceof NumericPayloadTokenFilterFactory) {
                // encodes using `PayloadHelper.encodeFloat(payload)`
                encoder = "float";
                break;
            }
        }
    }
    return encoder;
}
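
For context, a minimal hypothetical caller, not taken from the Solr source; the field name "text_payloads" and the surrounding class are assumptions for illustration.

import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.util.PayloadUtils;

public class PayloadEncoderSketch {
    /** Resolves the payload encoder for an assumed field name ("text_payloads" is hypothetical). */
    public static String encoderFor(IndexSchema schema) {
        FieldType fieldType = schema.getFieldType("text_payloads");
        // yields "float" for NumericPayloadTokenFilterFactory, the ENCODER_ATTR value
        // for DelimitedPayloadTokenFilterFactory, or null when no payload-producing
        // filter is configured on the index analysis chain
        return PayloadUtils.getPayloadEncoder(fieldType);
    }
}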
Use of org.apache.lucene.analysis.util.TokenFilterFactory in project lucene-solr by apache.
From the class AnalyzerFactory, the method toString:
@Override
public String toString() {
    StringBuilder sb = new StringBuilder("AnalyzerFactory(");
    if (null != name) {
        sb.append("name:");
        sb.append(name);
        sb.append(", ");
    }
    if (null != positionIncrementGap) {
        sb.append("positionIncrementGap:");
        sb.append(positionIncrementGap);
        sb.append(", ");
    }
    if (null != offsetGap) {
        sb.append("offsetGap:");
        sb.append(offsetGap);
        sb.append(", ");
    }
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        sb.append(charFilterFactory);
        sb.append(", ");
    }
    sb.append(tokenizerFactory);
    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
        sb.append(", ");
        sb.append(tokenFilterFactory);
    }
    sb.append(')');
    return sb.toString();
}
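
To illustrate the shape of the resulting string: with made-up component values (and noting that each appended factory renders via its own toString(), so the exact form depends on the factory classes), the output looks roughly like:

    AnalyzerFactory(name:text_en, positionIncrementGap:100, HTMLStripCharFilterFactory@3f2a1b, StandardTokenizerFactory@5c6d7e, LowerCaseFilterFactory@1a2b3c)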
Use of org.apache.lucene.analysis.util.TokenFilterFactory in project lucene-solr by apache.
From the class TestAllAnalyzersHaveFactories, the method test:
public void test() throws Exception {
    List<Class<?>> analysisClasses = TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
    for (final Class<?> c : analysisClasses) {
        final int modifiers = c.getModifiers();
        // don't waste time with abstract classes; deprecated ones are typically back-compat hacks
        if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
                || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
                || testComponents.contains(c) || crazyComponents.contains(c)
                || oddlyNamedComponents.contains(c) || tokenFiltersWithoutFactory.contains(c)
                || c.isAnnotationPresent(Deprecated.class)
                || !(Tokenizer.class.isAssignableFrom(c)
                        || TokenFilter.class.isAssignableFrom(c)
                        || CharFilter.class.isAssignableFrom(c))) {
            continue;
        }
        Map<String, String> args = new HashMap<>();
        args.put("luceneMatchVersion", Version.LATEST.toString());
        if (Tokenizer.class.isAssignableFrom(c)) {
            String clazzName = c.getSimpleName();
            assertTrue(clazzName.endsWith("Tokenizer"));
            // strip the "Tokenizer" suffix (9 characters) to get the factory's SPI name
            String simpleName = clazzName.substring(0, clazzName.length() - 9);
            assertNotNull(TokenizerFactory.lookupClass(simpleName));
            TokenizerFactory instance = null;
            try {
                instance = TokenizerFactory.forName(simpleName, args);
                assertNotNull(instance);
                if (instance instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) instance).inform(loader);
                }
                assertSame(c, instance.create().getClass());
            } catch (IllegalArgumentException e) {
                // TODO: for now pass, because some factories do not yet have a default config that always works
            }
        } else if (TokenFilter.class.isAssignableFrom(c)) {
            String clazzName = c.getSimpleName();
            assertTrue(clazzName.endsWith("Filter"));
            // strip "TokenFilter" (11 characters) or "Filter" (6 characters)
            String simpleName = clazzName.substring(0, clazzName.length() - (clazzName.endsWith("TokenFilter") ? 11 : 6));
            assertNotNull(TokenFilterFactory.lookupClass(simpleName));
            TokenFilterFactory instance = null;
            try {
                instance = TokenFilterFactory.forName(simpleName, args);
                assertNotNull(instance);
                if (instance instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) instance).inform(loader);
                }
                Class<? extends TokenStream> createdClazz = instance.create(new KeywordTokenizer()).getClass();
                // only check the created instance if the factory actually wrapped the input stream
                if (KeywordTokenizer.class != createdClazz) {
                    assertSame(c, createdClazz);
                }
            } catch (IllegalArgumentException e) {
                // TODO: for now pass, because some factories do not yet have a default config that always works
            }
        } else if (CharFilter.class.isAssignableFrom(c)) {
            String clazzName = c.getSimpleName();
            assertTrue(clazzName.endsWith("CharFilter"));
            // strip the "CharFilter" suffix (10 characters)
            String simpleName = clazzName.substring(0, clazzName.length() - 10);
            assertNotNull(CharFilterFactory.lookupClass(simpleName));
            CharFilterFactory instance = null;
            try {
                instance = CharFilterFactory.forName(simpleName, args);
                assertNotNull(instance);
                if (instance instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) instance).inform(loader);
                }
                Class<? extends Reader> createdClazz = instance.create(new StringReader("")).getClass();
                // only check the created instance if the factory actually wrapped the input reader
                if (StringReader.class != createdClazz) {
                    assertSame(c, createdClazz);
                }
            } catch (IllegalArgumentException e) {
                // TODO: for now pass, because some factories do not yet have a default config that always works
            }
        }
    }
}
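
The suffix-stripping convention the test exercises can be tried in isolation. A minimal sketch, assuming the stock Lucene analyzers module is on the classpath so that the SPI name "LowerCase" (derived from LowerCaseFilter minus its "Filter" suffix) is registered:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.Version;

public class FactoryLookupSketch {
    public static void main(String[] args) {
        // "LowerCaseFilter" -> strip the "Filter" suffix -> SPI name "LowerCase"
        Map<String, String> factoryArgs = new HashMap<>();
        factoryArgs.put("luceneMatchVersion", Version.LATEST.toString());
        TokenFilterFactory factory = TokenFilterFactory.forName("LowerCase", factoryArgs);
        // expected (stock Lucene): org.apache.lucene.analysis.core.LowerCaseFilterFactory
        System.out.println(factory.getClass().getName());
    }
}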
Use of org.apache.lucene.analysis.util.TokenFilterFactory in project tika by apache.
From the class AnalyzerDeserializer, the method buildAnalyzer:
public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
    if (!value.isJsonObject()) {
        throw new IllegalArgumentException("Expecting a map of charfilter, tokenizer and tokenfilters");
    }
    JsonObject aRoot = (JsonObject) value;
    CharFilterFactory[] charFilters = new CharFilterFactory[0];
    TokenizerFactory tokenizerFactory = null;
    TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
    for (Map.Entry<String, JsonElement> e : aRoot.entrySet()) {
        String k = e.getKey();
        if (k.equals(CHAR_FILTERS)) {
            charFilters = buildCharFilters(e.getValue(), analyzerName);
        } else if (k.equals(TOKEN_FILTERS)) {
            tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
        } else if (k.equals(TOKENIZER)) {
            tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
        } else if (!k.equals(COMMENT)) {
            throw new IllegalArgumentException("Should have one of three values here: " + CHAR_FILTERS + ", " + TOKENIZER + ", " + TOKEN_FILTERS + ". I don't recognize: " + k);
        }
    }
    if (tokenizerFactory == null) {
        throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
    }
    return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories);
}
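
A hedged usage sketch: feed a small JSON definition to buildAnalyzer via Gson. The key spellings ("tokenizer", "tokenfilters"), the factory names, and the meaning of maxTokens = -1 (taken here as "no limit") are assumptions for illustration, not verified against the Tika source.

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import org.apache.lucene.analysis.Analyzer;

public class AnalyzerJsonSketch {
    public static void main(String[] args) throws Exception {
        // key names and factory spellings are assumed for illustration
        String json = "{"
                + "\"tokenizer\": {\"factory\": \"whitespace\"},"
                + "\"tokenfilters\": [{\"factory\": \"lowercase\"}]"
                + "}";
        JsonElement root = new JsonParser().parse(json); // JsonParser.parseString(...) in newer Gson
        Analyzer analyzer = AnalyzerDeserializer.buildAnalyzer("my_analyzer", root, -1);
        System.out.println(analyzer);
    }
}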
Use of org.apache.lucene.analysis.util.TokenFilterFactory in project stanbol by apache.
From the class KuromojiNlpEngine, the method computeEnhancements:
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager})
 * should take care of persistent storage of the enhanced
 * {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using
 * {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart}
 * from a text/plain part and stores it as a new part in the content item.
 * The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *           if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! "
            + "As this is also checked within the #canEnhance(..) method, this "
            + "indicates a bug in the used EnhancementJobManager implementation. "
            + "Please report this on dev@apache.stanbol.org or create a "
            + "JIRA issue about this.");
    }
    // start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    // build the analysis chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    // try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    // NER data
    List<NerData> nerList = new ArrayList<NerData>();
    // the next index where the NerData.context needs to be set
    int nerSentIndex = 0;
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        // reset() is required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            // get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            // sentence detection by POS tag
            if (sentStartOffset < 0) {
                // the previous token ended a sentence, so a new one starts here
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                // add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            // POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            // NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                // write the NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                // NOTE that the fise:TextAnnotations are written later, based on the nerList
                // clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                // and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
            // NOTE: the inflection form and type are currently read but not used
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
            if (morpho != null) {
                // if present, add the morpho features
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        // we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        // and set the context of the remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                // no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from AnalyzedText content part", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            /* ignore */
        }
    }
    // finally, write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT,
                new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
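
The engine drives its analysis chain through the standard TokenStream contract: reset(), then incrementToken() in a loop, then close(). A minimal standalone sketch of that contract against a bare Kuromoji JapaneseTokenizer, independent of the Stanbol engine; the constructor arguments (no user dictionary, discard punctuation, SEARCH mode) and the sample text are assumptions:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class KuromojiStreamSketch {
    public static void main(String[] args) throws Exception {
        // assumed configuration: no user dictionary, discard punctuation, SEARCH mode
        JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
        tokenizer.setReader(new StringReader("今日はいい天気です。"));
        TokenStream stream = tokenizer;
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        PartOfSpeechAttribute pos = stream.addAttribute(PartOfSpeechAttribute.class);
        stream.reset(); // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.printf("%s [%d,%d] %s%n",
                    term, offset.startOffset(), offset.endOffset(), pos.getPartOfSpeech());
        }
        stream.end();
        stream.close();
    }
}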