Search in sources:

Example 6 with AttributeSource

use of org.apache.lucene.util.AttributeSource in project commons by twitter.

In the class TokenTypeAttributeSerializerTest, the method deserialize.

private TokenType deserialize(byte[] serialized) throws IOException {
    // Fresh AttributeSource that will receive the decoded attribute value.
    AttributeSource source = new AttributeSource();
    TokenTypeAttribute typeAttr = source.addAttribute(TokenTypeAttribute.class);
    // Bind the serializer to the source at the current wire-format version.
    TokenTypeAttributeSerializer deserializer = new TokenTypeAttributeSerializer();
    deserializer.initialize(source, TokenStreamSerializer.CURRENT_VERSION);
    // Wrap the raw bytes and decode a single attribute frame into typeAttr.
    TokenStreamSerializer.AttributeInputStream stream =
        new TokenStreamSerializer.AttributeInputStream(new ByteArrayInputStream(serialized));
    deserializer.deserialize(stream, null);
    return typeAttr.getType();
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) ByteArrayInputStream(java.io.ByteArrayInputStream) TokenTypeAttribute(com.twitter.common.text.token.attribute.TokenTypeAttribute)

Example 7 with AttributeSource

use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

In the class AnalysisRequestHandlerBase, the method analyzeTokenStream.

/**
   * Analyzes the given TokenStream, collecting the Tokens it produces.
   *
   * <p>Each produced token is captured as a cloned {@link AttributeSource}
   * snapshot, with its absolute position recorded via TokenTrackingAttribute.
   *
   * @param tokenStream TokenStream to analyze; always closed before returning
   *
   * @return List of tokens produced from the TokenStream
   * @throws RuntimeException wrapping any IOException thrown during iteration
   */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            // Accumulate the absolute position from per-token increments.
            position += posIncrAtt.getPositionIncrement();
            trackerAtt.setActPosition(position);
            // Snapshot the stream's current attribute state for this token.
            tokens.add(tokenStream.cloneAttributes());
        }
        // TODO should we capture?
        tokenStream.end();
    } catch (IOException ioe) {
        // Fixed typo in message: "occured" -> "occurred". Original cause is preserved.
        throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
    } finally {
        // Always release the stream, even when iteration fails part-way.
        IOUtils.closeWhileHandlingException(tokenStream);
    }
    return tokens;
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) ArrayList(java.util.ArrayList) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 8 with AttributeSource

use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

In the class MockSynonymFilter, the method addSynonymAndRestoreOrigToken.

// Emits a synonym token while keeping the original token as the filter's
// current state: snapshot the current attributes, let addSynonym overwrite
// them, then copy the snapshot back so the original token is restored.
private void addSynonymAndRestoreOrigToken(String synonymText, int posLen, int endOffset) {
    AttributeSource origToken = cloneAttributes();
    addSynonym(synonymText, posLen, endOffset);
    // Restore the pre-synonym attribute values onto this filter.
    origToken.copyTo(this);
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource)

Example 9 with AttributeSource

use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

In the class FuzzyLikeThisQuery, the method addTerms.

/**
 * Tokenizes the field's query string and, for each distinct term, enumerates
 * fuzzy variants, keeps the top-scoring ones in a bounded queue, rescales their
 * scores by an IDF factor, and pushes them into the global queue {@code q}.
 *
 * @param reader index reader used for doc-frequency statistics
 * @param f      per-field query values (field name, query string, fuzzy params)
 * @param q      global queue collecting the overall top-scoring terms
 * @throws IOException if tokenization or term enumeration fails
 */
private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
    if (f.queryString == null)
        return;
    final Terms terms = MultiFields.getTerms(reader, f.fieldName);
    if (terms == null) {
        // Field has no indexed terms; nothing to enumerate.
        return;
    }
    try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        int corpusNumDocs = reader.numDocs();
        // Guards against re-processing a term that appears more than once in the query.
        HashSet<String> processedTerms = new HashSet<>();
        ts.reset();
        while (ts.incrementToken()) {
            String term = termAtt.toString();
            if (!processedTerms.contains(term)) {
                processedTerms.add(term);
                //maxNum variants considered for any one term
                ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM);
                float minScore = 0;
                Term startTerm = new Term(f.fieldName, term);
                // AttributeSource shared with the enum so we can feed competitiveness
                // information back into the enumeration (see maxBoostAtt below).
                AttributeSource atts = new AttributeSource();
                MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
                FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.maxEdits, f.prefixLength, true);
                //store the df so all variants use same idf
                int df = reader.docFreq(startTerm);
                int numVariants = 0;
                int totalVariantDocFreqs = 0;
                BytesRef possibleMatch;
                // Per-variant score (edit-distance based) published by the enum.
                BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
                while ((possibleMatch = fe.next()) != null) {
                    numVariants++;
                    totalVariantDocFreqs += fe.docFreq();
                    float score = boostAtt.getBoost();
                    if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
                        // deepCopyOf: the enum reuses its BytesRef across next() calls.
                        ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
                        variantsQ.insertWithOverflow(st);
                        // maintain minScore
                        minScore = variantsQ.top().score;
                    }
                    // Once the queue is full, tell the enum it may skip terms that
                    // cannot beat the current minimum score.
                    maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
                }
                if (numVariants > 0) {
                    int avgDf = totalVariantDocFreqs / numVariants;
                    if (//no direct match we can use as df for all variants
                    df == 0) {
                        //use avg df of all variants
                        df = avgDf;
                    }
                    // take the top variants (scored by edit distance) and reset the score
                    // to include an IDF factor then add to the global queue for ranking
                    // overall top query terms
                    int size = variantsQ.size();
                    for (int i = 0; i < size; i++) {
                        ScoreTerm st = variantsQ.pop();
                        st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
                        q.insertWithOverflow(st);
                    }
                }
            }
        }
        ts.end();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) AttributeSource(org.apache.lucene.util.AttributeSource) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) MaxNonCompetitiveBoostAttribute(org.apache.lucene.search.MaxNonCompetitiveBoostAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) FuzzyTermsEnum(org.apache.lucene.search.FuzzyTermsEnum) BoostAttribute(org.apache.lucene.search.BoostAttribute) MaxNonCompetitiveBoostAttribute(org.apache.lucene.search.MaxNonCompetitiveBoostAttribute) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)

Example 10 with AttributeSource

use of org.apache.lucene.util.AttributeSource in project lucene-solr by apache.

In the class SimplePreAnalyzedParser, the method parse.

/**
 * Parses the "simple" pre-analyzed text format into a {@link ParseResult}.
 *
 * <p>Input layout: {@code VERSION [=stored=] token[,name=value[,...]] token ...}
 * — a version prefix, an optional stored-field section delimited by unescaped
 * {@code '='} characters, then space-separated tokens, each optionally followed
 * by comma-separated {@code name=value} attribute pairs. Backslash escapes
 * ({@code \\ \= \, \space \n \r \t}) are honored in the token section; the
 * stored section recognizes only {@code \=}.
 *
 * @param reader source of the serialized field value (read fully into memory)
 * @param parent AttributeSource whose attribute states are captured per token
 * @return ParseResult with the optional stored string and one captured
 *         attribute state per parsed token
 * @throws IOException on read failure or any syntax error in the input
 */
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    // Slurp the entire reader; the format is parsed from a single string.
    while ((cnt = reader.read(buf)) > 0) {
        sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
        return res;
    }
    // first consume the version
    int idx = val.indexOf(' ');
    if (idx == -1) {
        throw new IOException("Missing VERSION token");
    }
    String version = val.substring(0, idx);
    if (!VERSION.equals(version)) {
        throw new IOException("Unknown VERSION " + version);
    }
    val = val.substring(idx + 1);
    // then consume the optional stored part
    int tsStart = 0;
    boolean hasStored = false;
    StringBuilder storedBuf = new StringBuilder();
    if (val.charAt(0) == '=') {
        hasStored = true;
        if (val.length() > 1) {
            for (int i = 1; i < val.length(); i++) {
                char c = val.charAt(i);
                if (c == '\\') {
                    if (i < val.length() - 1) {
                        c = val.charAt(++i);
                        if (c == '=') {
                            // we recognize only \= escape in the stored part
                            storedBuf.append('=');
                        } else {
                            // Any other backslash sequence is kept verbatim.
                            storedBuf.append('\\');
                            storedBuf.append(c);
                            continue;
                        }
                    } else {
                        // Trailing backslash at end of input: keep it literally.
                        storedBuf.append(c);
                        continue;
                    }
                } else if (c == '=') {
                    // end of stored text
                    tsStart = i + 1;
                    break;
                } else {
                    storedBuf.append(c);
                }
            }
            if (tsStart == 0) {
                // missing end-of-stored marker
                throw new IOException("Missing end marker of stored part");
            }
        } else {
            throw new IOException("Unexpected end of stored field");
        }
    }
    if (hasStored) {
        res.str = storedBuf.toString();
    }
    // Token-section state machine. S is the parser-state enum declared elsewhere
    // in this class: TOKEN = reading token text, NAME = reading an attribute
    // name, VALUE = reading an attribute value, UNDEF = between tokens.
    Tok tok = new Tok();
    StringBuilder attName = new StringBuilder();
    StringBuilder attVal = new StringBuilder();
    // parser state
    S s = S.UNDEF;
    // lastPos counts characters of token text consumed so far; passed to
    // createState, presumably for offset bookkeeping — confirm against createState.
    int lastPos = 0;
    for (int i = tsStart; i < val.length(); i++) {
        char c = val.charAt(i);
        if (c == ' ') {
            // Unescaped space terminates the current token; flush pending state.
            // collect leftovers
            switch(s) {
                case VALUE:
                    if (attVal.length() == 0) {
                        throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
                    }
                    if (attName.length() > 0) {
                        tok.attr.put(attName.toString(), attVal.toString());
                    }
                    break;
                case // attr name without a value ?
                NAME:
                    if (attName.length() > 0) {
                        throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
                    } else {
                    // accept missing att name and value
                    }
                    break;
                case TOKEN:
                case UNDEF:
            }
            attName.setLength(0);
            attVal.setLength(0);
            if (!tok.isEmpty() || s == S.NAME) {
                AttributeSource.State state = createState(parent, tok, lastPos);
                if (state != null)
                    // clone: createState may reuse its State instance — TODO confirm.
                    res.states.add(state.clone());
            }
            // reset tok
            s = S.UNDEF;
            tok.reset();
            // skip
            lastPos++;
            continue;
        }
        // Route the character to the buffer matching the current state.
        StringBuilder tgt = null;
        switch(s) {
            case TOKEN:
                tgt = tok.token;
                break;
            case NAME:
                tgt = attName;
                break;
            case VALUE:
                tgt = attVal;
                break;
            case UNDEF:
                // First character of a new token.
                tgt = tok.token;
                s = S.TOKEN;
        }
        if (c == '\\') {
            // Escape sequence: the escaped pair contributes one position.
            if (s == S.TOKEN)
                lastPos++;
            if (i >= val.length() - 1) {
                // end
                tgt.append(c);
                continue;
            } else {
                c = val.charAt(++i);
                switch(c) {
                    case '\\':
                    case '=':
                    case ',':
                    case ' ':
                        // Escaped metacharacter: append the literal character.
                        tgt.append(c);
                        break;
                    case 'n':
                        tgt.append('\n');
                        break;
                    case 'r':
                        tgt.append('\r');
                        break;
                    case 't':
                        tgt.append('\t');
                        break;
                    default:
                        // Unknown escape: keep both characters verbatim.
                        tgt.append('\\');
                        tgt.append(c);
                        lastPos++;
                }
            }
        } else {
            // state switch
            if (c == ',') {
                if (s == S.TOKEN) {
                    // Token text done; attributes follow.
                    s = S.NAME;
                } else if (s == S.VALUE) {
                    // end of value, start of next attr
                    if (attVal.length() == 0) {
                        throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
                    }
                    if (attName.length() > 0 && attVal.length() > 0) {
                        tok.attr.put(attName.toString(), attVal.toString());
                    }
                    // reset
                    attName.setLength(0);
                    attVal.setLength(0);
                    s = S.NAME;
                } else {
                    throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
                }
            } else if (c == '=') {
                if (s == S.NAME) {
                    s = S.VALUE;
                } else {
                    throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
                }
            } else {
                tgt.append(c);
                // Only token text advances the position counter.
                if (s == S.TOKEN)
                    lastPos++;
            }
        }
    }
    // collect leftovers
    if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) {
        // remaining attrib?
        if (s == S.VALUE) {
            if (attName.length() > 0 && attVal.length() > 0) {
                tok.attr.put(attName.toString(), attVal.toString());
            }
        }
        AttributeSource.State state = createState(parent, tok, lastPos);
        if (state != null)
            res.states.add(state.clone());
    }
    return res;
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) ParseResult(org.apache.solr.schema.PreAnalyzedField.ParseResult) State(org.apache.lucene.util.AttributeSource.State) IOException(java.io.IOException)

Aggregations

AttributeSource (org.apache.lucene.util.AttributeSource)10 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 BytesRef (org.apache.lucene.util.BytesRef)3 TokenTypeAttribute (com.twitter.common.text.token.attribute.TokenTypeAttribute)2 TokenStream (org.apache.lucene.analysis.TokenStream)2 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)2 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)2 Terms (org.apache.lucene.index.Terms)2 BoostAttribute (org.apache.lucene.search.BoostAttribute)2 FuzzyTermsEnum (org.apache.lucene.search.FuzzyTermsEnum)2 MaxNonCompetitiveBoostAttribute (org.apache.lucene.search.MaxNonCompetitiveBoostAttribute)2 NamedList (org.apache.solr.common.util.NamedList)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1 HashSet (java.util.HashSet)1 List (java.util.List)1 PriorityQueue (java.util.PriorityQueue)1