use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project stanbol by apache.
the class SmartcnSentenceEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, false);
if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
}
// first the sentences
TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
try {
sentences.reset();
while (sentences.incrementToken()) {
OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
if (log.isTraceEnabled()) {
log.trace("detected {}:{}", s, s.getSpan());
}
}
} catch (IOException e) {
String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
log.error(message, e);
throw new EngineException(this, ci, message, e);
}
}
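The same offset-reading pattern in isolation: a minimal sketch, assuming the Lucene 4.x-era smartcn SentenceTokenizer used above and a placeholder chineseText string. The engine stores the offsets in the AnalysedText content part instead of printing them.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class SentenceOffsetDemo {
    public static void main(String[] args) throws IOException {
        String chineseText = "..."; // placeholder: any Chinese plain text
        Tokenizer sentences = new SentenceTokenizer(new StringReader(chineseText));
        // register the attribute once; its values are refreshed on every incrementToken()
        OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
        try {
            sentences.reset(); // mandatory before the first incrementToken() since Lucene 4
            while (sentences.incrementToken()) {
                System.out.println(offset.startOffset() + "-" + offset.endOffset());
            }
            sentences.end();
        } finally {
            sentences.close();
        }
    }
}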
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project stanbol by apache.
the class QueryUtils method parseWildcardQueryTerms.
/**
* Parses query terms for Wildcard queries as described in the first
* comment of STANBOL-607. <p>
* As an example the String:
* <code><pre>
* "This is a te?t for multi* Toke? Wildc\*adrd Se?rche*
* </pre></code>
* is converted into the query terms
* <code><pre>
* ["This is a","te?t","multi*","toke?","Wildc\*adrd","se?rche*"]
* </pre></code>
* NOTE: tokens that include a wildcard are converted to lower case (see the loewercaseWildcardTokens parameter)
* @param value the value
* @param loewercaseWildcardTokens whether query elements that include a wildcard
* should be converted to lower case.
* @return the query terms
*/
private static QueryTerm[] parseWildcardQueryTerms(String value, boolean loewercaseWildcardTokens) {
// This assumes that the Tokenizer does tokenize '*' and '?',
// which makes it a little bit tricky.
Tokenizer tokenizer = new ICUTokenizer(new StringReader(value), tokenizerConfig);
Matcher m = WILDCARD_QUERY_CHAR_PATTERN.matcher(value);
int next = m.find() ? m.start() + 1 : -1;
if (next < 0) {
// No wildcard
return new QueryTerm[] { new QueryTerm(value, false, true, true) };
}
ArrayList<QueryTerm> queryElements = new ArrayList<QueryTerm>(5);
int lastAdded = -1;
int lastOffset = 0;
boolean foundWildcard = false;
// Lucene tokenizers are really low level ...
try {
// starting with Solr 4, reset() MUST be called before use
tokenizer.reset();
while (tokenizer.incrementToken()) {
// only interested in the start/end indexes of tokens
OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
if (lastAdded < 0) {
// no pending query term: start one at this token
lastAdded = offset.startOffset();
}
if (foundWildcard) {
// a wildcard ended the previous token; decide how to close the pending query term
if (offset.startOffset() > lastOffset + 1) {
// (1)
String queryElement = value.substring(lastAdded, lastOffset + 1);
if (loewercaseWildcardTokens) {
queryElement = queryElement.toLowerCase();
}
queryElements.add(new QueryTerm(queryElement, true, false, true));
// previous token consumed
lastAdded = offset.startOffset();
// set to the start of the current token
foundWildcard = false;
} else if (next != offset.endOffset()) {
// (2)
String queryElement = value.substring(lastAdded, offset.endOffset());
if (loewercaseWildcardTokens) {
queryElement = queryElement.toLowerCase();
}
queryElements.add(new QueryTerm(queryElement, true, false, true));
// consume the current token
lastAdded = -1;
foundWildcard = false;
}
}
if (next == offset.endOffset()) {
// end of current token is '*' or '?'
// search next '*', '?' in value
next = m.find() ? m.start() + 1 : -1;
// emit the plain words collected before this wildcard token as their own term
if (!foundWildcard && lastAdded < lastOffset) {
String queryElement = value.substring(lastAdded, lastOffset);
queryElements.add(new QueryTerm(queryElement, false, true, true));
lastAdded = offset.startOffset();
}
// else multiple wildcards in a single token
foundWildcard = true;
}
lastOffset = offset.endOffset();
}
} catch (IOException e) {
// StringReader can not throw IOExceptions
throw new IllegalStateException(e);
}
if (lastAdded >= 0 && lastAdded < value.length()) {
String queryElement = value.substring(lastAdded, value.length());
if (foundWildcard && loewercaseWildcardTokens) {
queryElement = queryElement.toLowerCase();
}
if (foundWildcard) {
queryElements.add(new QueryTerm(queryElement, true, false, true));
} else {
queryElements.add(new QueryTerm(queryElement, false, true, true));
}
}
return queryElements.toArray(new QueryTerm[queryElements.size()]);
}
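Because the tokenizer's OffsetAttribute always indexes into the original input, the method can cut query terms straight out of value with substring(). A minimal sketch of that mapping, assuming the Lucene 4.x ICUTokenizer(Reader) constructor used above (newer Lucene versions create the tokenizer without a Reader and call setReader()); the sample query string is illustration only.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetSubstringDemo {
    public static void main(String[] args) {
        String value = "This is a te?t for multi* Toke?"; // sample query, illustration only
        Tokenizer tokenizer = new ICUTokenizer(new StringReader(value));
        OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
        try {
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                // startOffset()/endOffset() refer to positions in 'value'
                System.out.println(value.substring(offset.startOffset(), offset.endOffset()));
            }
            tokenizer.end();
            tokenizer.close();
        } catch (IOException e) {
            throw new IllegalStateException(e); // a StringReader cannot actually fail
        }
    }
}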
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project stanbol by apache.
the class KuromojiNlpEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, false);
if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
}
// start with the Tokenizer
TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
// build the analyzing chain by adding all TokenFilters
for (TokenFilterFactory filterFactory : filterFactories) {
tokenStream = filterFactory.create(tokenStream);
}
// Try to extract sentences based on POS tags ...
int sentStartOffset = -1;
// NER data
List<NerData> nerList = new ArrayList<NerData>();
// the next index where the NerData.context needs to be set
int nerSentIndex = 0;
NerData ner = null;
OffsetAttribute offset = null;
try {
// required with Solr 4
tokenStream.reset();
while (tokenStream.incrementToken()) {
offset = tokenStream.addAttribute(OffsetAttribute.class);
Token token = at.addToken(offset.startOffset(), offset.endOffset());
// Get the POS attribute and init the PosTag
PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = adhocTags.get(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = new PosTag(posAttr.getPartOfSpeech());
adhocTags.put(posAttr.getPartOfSpeech(), posTag);
log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
}
}
// Sentence detection by POS tag
if (sentStartOffset < 0) {
// the last token was a sentence ending
sentStartOffset = offset.startOffset();
}
if (posTag.hasPos(Pos.Point)) {
Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
// add the sentence as context to the NerData instances
while (nerSentIndex < nerList.size()) {
nerList.get(nerSentIndex).context = sent.getSpan();
nerSentIndex++;
}
sentStartOffset = -1;
}
// POS
token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
// NER
NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
// write NER annotation
Chunk chunk = at.addChunk(ner.start, ner.end);
chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
// NOTE that the fise:TextAnnotations are written later based on the nerList
// clean up
ner = null;
}
if (nerTag != null) {
if (ner == null) {
ner = new NerData(nerTag, offset.startOffset());
nerList.add(ner);
}
ner.end = offset.endOffset();
}
BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
MorphoFeatures morpho = null;
if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
// and add the posTag
morpho.addPos(posTag);
}
InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
// the inflection form and type are read but currently not added to the MorphoFeatures
inflectionAttr.getInflectionForm();
inflectionAttr.getInflectionType();
if (morpho != null) {
// if present add the morpho
token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
}
}
// we still need to write the last sentence
Sentence lastSent = null;
if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
lastSent = at.addSentence(sentStartOffset, offset.endOffset());
}
// and set the context of the remaining named entities
while (nerSentIndex < nerList.size()) {
if (lastSent != null) {
nerList.get(nerSentIndex).context = lastSent.getSpan();
} else {
// no sentence detected
nerList.get(nerSentIndex).context = at.getSpan();
}
nerSentIndex++;
}
} catch (IOException e) {
throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
} finally {
try {
tokenStream.close();
} catch (IOException e) {
/* ignore */
}
}
// finally write the NER annotations to the metadata of the ContentItem
final Graph metadata = ci.getMetadata();
ci.getLock().writeLock().lock();
try {
Language lang = new Language("ja");
for (NerData nerData : nerList) {
IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
}
} finally {
ci.getLock().writeLock().unlock();
}
}
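A minimal sketch of the raw Kuromoji attributes the engine maps to PosTag, NerTag and MorphoFeatures above, assuming the Lucene 4.x JapaneseTokenizer constructor that takes a Reader (newer releases construct the tokenizer without one); japaneseText is a placeholder.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class KuromojiAttributeDemo {
    public static void main(String[] args) throws IOException {
        String japaneseText = "..."; // placeholder: any Japanese plain text
        JapaneseTokenizer tokens = new JapaneseTokenizer(
                new StringReader(japaneseText), null, true, JapaneseTokenizer.Mode.NORMAL);
        OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
        PartOfSpeechAttribute pos = tokens.addAttribute(PartOfSpeechAttribute.class);
        BaseFormAttribute base = tokens.addAttribute(BaseFormAttribute.class);
        try {
            tokens.reset();
            while (tokens.incrementToken()) {
                // getBaseForm() returns null when the surface form already is the base form
                System.out.println(offset.startOffset() + "-" + offset.endOffset()
                        + " pos=" + pos.getPartOfSpeech() + " base=" + base.getBaseForm());
            }
            tokens.end();
        } finally {
            tokens.close();
        }
    }
}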
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project sukija by ahomansikka.
the class SuggestionFilter method filter.
@Override
protected Iterator<String> filter() {
OffsetAttribute offsetAtt = input.getAttribute(OffsetAttribute.class);
if (LOG.isDebugEnabled())
LOG.debug("Word-f A " + word + " " + termAtt.toString() + " " + offsetAtt.startOffset() + " " + offsetAtt.endOffset() + " " + Constants.toString(flagsAtt));
if (hasFlag(flagsAtt, LATEX_HYPHEN)) {
word = word.replace("\\-", "");
}
final int n = word.lastIndexOf('-');
if (n > 0) {
Constants.addFlags(flagsAtt, Constants.COMPOUND_WORD);
} else {
Constants.removeFlags(flagsAtt, Constants.COMPOUND_WORD);
}
if (LOG.isDebugEnabled())
LOG.debug("Word-f B " + word + " " + termAtt.toString() + " " + Constants.toString(flagsAtt));
if (hasFlag(flagsAtt, Constants.COMPOUND_WORD)) {
if (AnalysisUtils.analyze(voikko, word, voikkoAtt, baseFormAtt, flagsAtt)) {
return baseFormAtt.getBaseForms().iterator();
} else {
//
if (LOG.isDebugEnabled())
LOG.debug("Word-f C " + word + " " + termAtt.toString() + " " + Constants.toString(flagsAtt));
final String START = word.substring(0, n);
final String END = word.substring(n + 1);
if (LOG.isDebugEnabled())
LOG.debug("Word-f D " + word + " " + termAtt.toString() + " " + Constants.toString(flagsAtt) + " [" + START + "] " + END);
Set<String> baseForms = new HashSet<String>();
Set<String> result = suggest(END);
if (LOG.isDebugEnabled())
LOG.debug("Word-f E " + END + " " + result.toString());
if (result != null) {
for (String u : result) {
baseForms.add(START + "-" + u);
baseForms.add(START + u);
}
return baseForms.iterator();
}
}
} else {
Set<String> s = suggest(word);
if (s != null) {
return s.iterator();
}
}
return null;
}
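The first line of filter() shows the usual way a TokenFilter reads the upstream OffsetAttribute: attributes are shared with the wrapped input stream, so input.getAttribute() (or addAttribute() on the filter itself) sees the values set by the tokenizer. A hypothetical, minimal filter illustrating only that access pattern (not part of sukija):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public final class OffsetLoggingFilter extends TokenFilter {

    // addAttribute() on the filter returns the same instances the upstream tokenizer fills in
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    public OffsetLoggingFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        System.out.println(termAtt + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ")");
        return true; // token passed through unchanged
    }
}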
use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project zm-mailbox by Zimbra.
the class UniversalAnalyzerTest method testCJK.
private void testCJK(String src) throws IOException {
TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src));
CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class);
OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class);
TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);
while (true) {
boolean result = cjk.incrementToken();
Assert.assertEquals(result, uni.incrementToken());
if (!result) {
break;
}
String term = cjkTermAttr.toString();
Assert.assertEquals(cjkTermAttr, uniTermAttr);
if (assertOffset) {
Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr);
}
Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr);
}
}
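A hypothetical caller for the helper above (not part of zm-mailbox), sketched as a JUnit 4 test method in the same class; assertOffset is assumed to be a settable flag of the test class, and the sample string is illustration only.

@Test
public void cjkTokenizationMatchesCjkAnalyzer() throws IOException {
    assertOffset = true; // also compare the OffsetAttribute of both streams
    testCJK("中文分词测试"); // sample CJK input, illustration only
}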