Search in sources :

Example 1 with CharSequenceReader

use of CharSequenceReader in project asterixdb by apache.

the class FunctionParser method getFunctionDecl.

// Re-parses a stored Function's body into a FunctionDecl AST node by
// synthesizing a small program ("use dataverse ...; declare function ...")
// and feeding it through the project's parser.
// NOTE(review): this snippet appears truncated by extraction — the else-branch
// body, the loop-closing braces, the varIdentifiers.add(...) call and the
// appended parameter list / function body are missing; verify against the
// upstream asterixdb FunctionParser source before reuse.
public FunctionDecl getFunctionDecl(Function function) throws CompilationException {
    String functionBody = function.getFunctionBody();
    List<String> params = function.getParams();
    List<VarIdentifier> varIdentifiers = new ArrayList<VarIdentifier>();
    StringBuilder builder = new StringBuilder();
    // Qualify the declaration with the function's dataverse so the parser
    // resolves names in the correct namespace.
    builder.append(" use dataverse " + function.getDataverseName() + ";");
    // The stored name carries an "@arity" suffix; only the bare name (before
    // the '@') is declared.
    builder.append(" declare function " + function.getName().split("@")[0]);
    boolean first = true;
    for (String param : params) {
        VarIdentifier varId = new VarIdentifier(param);
        // "first" flag style parameter separation — presumably the missing
        // else-branch appends a separator; TODO confirm against upstream.
        if (first) {
            first = false;
        } else {
    IParser parser = parserFactory.createParser(new CharSequenceReader(builder));
    List<Statement> statements = parser.parse();
    // statements.get(1): index 0 is the "use dataverse" statement, so the
    // function declaration is the second parsed statement.
    FunctionDecl decl = (FunctionDecl) statements.get(1);
    return decl;
Also used : CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) VarIdentifier(org.apache.asterix.lang.common.struct.VarIdentifier) Statement(org.apache.asterix.lang.common.base.Statement) ArrayList(java.util.ArrayList) IParser(org.apache.asterix.lang.common.base.IParser) FunctionDecl(org.apache.asterix.lang.common.statement.FunctionDecl)

Example 2 with CharSequenceReader

use of CharSequenceReader in project tutorials by eugenp.

the class JavaXToReaderUnitTest method givenUsingCommonsIO_whenConvertingFileIntoReader_thenCorrect.

public void givenUsingCommonsIO_whenConvertingFileIntoReader_thenCorrect() throws IOException {
    final File initialFile = new File("src/test/resources/initialFile.txt");
    FileUtils.write(initialFile, "With Commons IO");
    final byte[] buffer = FileUtils.readFileToByteArray(initialFile);
    final Reader targetReader = new CharSequenceReader(new String(buffer));
Also used : CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) StringReader(java.io.StringReader) FileReader(java.io.FileReader) File(java.io.File) Test(org.junit.Test)

Example 3 with CharSequenceReader

use of CharSequenceReader in project tutorials by eugenp.

the class JavaXToReaderUnitTest method givenUsingCommonsIO_whenConvertingByteArrayIntoReader_thenCorrect.

public void givenUsingCommonsIO_whenConvertingByteArrayIntoReader_thenCorrect() throws IOException {
    final byte[] initialArray = "With Commons IO".getBytes();
    final Reader targetReader = new CharSequenceReader(new String(initialArray));
Also used : CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) StringReader(java.io.StringReader) FileReader(java.io.FileReader) Test(org.junit.Test)

Example 4 with CharSequenceReader

use of CharSequenceReader in project stanbol by apache.

the class FstLinkingEngine method tag.

/**
 * Uses the {@link Corpus} to tag the {@link AnalysedText} and adds
 * tagging results to the parsed tag map.
 * @param content the content to link
 * @param at the AnalyzedText. not required if {@link LinkingModeEnum#PLAIN}
 * @param session the tagging session of the text
 * @param corpus the corpus of the session to tag the content with
 * @param tags the Tags map used to store the tagging results
 * @return the time in milliseconds spent in the tag callback.
 * @throws IOException on any error while accessing the {@link SolrCore}
 */
// Runs the FST tagger over the content for the configured linking mode and
// folds results into the shared tag map; returns callback time in ms.
// NOTE(review): this snippet is visibly truncated by extraction — the first
// switch case is missing its enum constant, 'break' statements and a
// 'default:' label (before the IllegalStateException) are gone, the
// createMatches loop body is gutted, and several closing braces are missing;
// verify against the upstream stanbol FstLinkingEngine source before reuse.
private int tag(final String content, final AnalysedText at, final TaggingSession session, final Corpus corpus, final Map<int[], Tag> tags) throws IOException {
    // Bit set sized to the searcher's doc space; flagged by createMatches
    // (see the "flags matchDocIdsBS" comment below).
    final OpenBitSet matchDocIdsBS = new OpenBitSet(session.getSearcher().maxDoc());
    // Base analysis chain over the raw content; CharSequenceReader avoids
    // copying the CharSequence into a String.
    TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("", new CharSequenceReader(content));
    final TokenStream tokenStream;
    final TagClusterReducer reducer;
    log.debug(" ... set up TokenStream and TagClusterReducer for linking mode {}", linkingMode);
    switch(linkingMode) {
        // NOTE(review): enum constant missing here (presumably PLAIN) — TODO confirm.
        case // will link all tokens and search longest dominant right
            tokenStream = baseTokenStream;
            reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
        case NER:
            // this uses the NamedEntityTokenFilter as tokenStream and a
            // combination with the longest dominant right as reducer
            NamedEntityTokenFilter neTokenFilter = new NamedEntityTokenFilter(baseTokenStream, at, session.getLanguage(), neTypeMappings.keySet(), session.entityMentionTypes);
            tokenStream = neTokenFilter;
            reducer = new ChainedTagClusterReducer(neTokenFilter, TagClusterReducer.LONGEST_DOMINANT_RIGHT);
        case LINKABLE_TOKEN:
            // this uses the LinkableTokenFilter as tokenStream
            LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()), elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
            // NOTE that the  LinkableTokenFilter implements longest dominant right
            // based on the matchable span of tags (instead of the whole span).
            reducer = new ChainedTagClusterReducer(linkableTokenFilter, TagClusterReducer.ALL);
            tokenStream = linkableTokenFilter;
            // NOTE(review): this throw presumably sits under a missing 'default:'
            // label guarding against new enum values — TODO confirm.
            throw new IllegalStateException("Unrecognized LinkingMode '" + linkingMode + "! Please adapt implementation to changed Enumeration!");
    log.debug(" - tokenStream: {}", tokenStream);
    log.debug(" - reducer: {} (class: {})", reducer, reducer.getClass().getName());
    // Now process the document
    // single-element array so the anonymous Tagger callback below can mutate
    // the accumulated time from its enclosing scope.
    final long[] time = new long[] { 0 };
    new Tagger(corpus.getFst(), tokenStream, reducer, session.isSkipAltTokens()) {

        // Called by the Tagger for every tagged span; merges matches into the
        // shared 'tags' map keyed by [startOffset, endOffset].
        protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
            long start = System.nanoTime();
            if (log.isTraceEnabled()) {
                log.trace(" > tagCallback for {}", content.subSequence(startOffset, endOffset));
            int[] span = new int[] { startOffset, endOffset };
            Tag tag = tags.get(span);
            if (tag == null) {
                tag = new Tag(span);
                tags.put(span, tag);
            // below caches, and also flags matchDocIdsBS
            Set<Match> matches = createMatches(docIdsKey);
            if (log.isTraceEnabled()) {
                log.trace("  - {} matches", matches.size());
            // accumulate time spent inside the callback (nanoseconds here;
            // converted to ms on return).
            long dif = System.nanoTime() - start;
            time[0] = time[0] + dif;

        // NOTE: We can not use a cache, because we need to create different
        // Match instances even for the same 'docIdsKey'. This is because
        // the same result list might get generated for different
        // surface forms in the text (e.g. if the SolrIndex is case
        // insensitive, but the linking does consider the case when
        // calculating the score). If we would use this cache Match
        // instances would be used for several occurrences in the text
        // and Match#getScore() values would get overridden when
        // processing those multiple occurrences.
        // Map<Long,Set<Match>> docIdsListCache = new HashMap<Long,Set<Match>>(1024);
        // Resolves the Tagger's docIdsKey to concrete Lucene doc ids and wraps
        // each in a Match. NOTE(review): the loop body (Match creation and the
        // matchDocIdsBS flip) was dropped by extraction — TODO confirm upstream.
        private Set<Match> createMatches(long docIdsKey) {
            IntsRef docIds = lookupDocIds(docIdsKey);
            Set<Match> matches = new HashSet<Match>(docIds.length);
            for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
                int docId = docIds.ints[i];
                // also, flip docid in bitset
                // translates here
            return matches;
    // nanoseconds -> milliseconds
    return (int) (time[0] / 1000000);
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) OpenBitSet(org.apache.lucene.util.OpenBitSet) Set(java.util.Set) OpenBitSet(org.apache.lucene.util.OpenBitSet) HashSet(java.util.HashSet) Tagger(org.opensextant.solrtexttagger.Tagger) TagClusterReducer(org.opensextant.solrtexttagger.TagClusterReducer) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IntsRef(org.apache.lucene.util.IntsRef)

Example 5 with CharSequenceReader

use of CharSequenceReader in project stanbol by apache.

the class SmartcnTokenizerEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
// Tokenizes Chinese text in the ContentItem with the Lucene smartcn analyzers:
// first sentences (if none were detected yet), then word tokens.
// NOTE(review): this snippet is truncated by extraction — closing braces for
// the guard clauses, loops and try blocks are missing; verify against the
// upstream stanbol SmartcnTokenizerEngine source before reuse. The exception
// message string also looks cut ("report this on the or create an") — that is
// runtime text, left untouched here.
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    // Guard: this engine only handles Chinese ("zh" or "zh-*"); canEnhance
    // should already have filtered, so anything else indicates a manager bug.
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the or create an " + "JIRA issue about this.");
    if (!at.getSentences().hasNext()) {
        // no sentences  ... use this engine to detect
        // first the sentences
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            while (sentences.incrementToken()) {
                // Sentence boundaries come from the tokenizer's offset attribute.
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
        } catch (IOException e) {
            String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
            log.error(message, e);
            throw new EngineException(this, ci, message, e);
    // now the tokens
    TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
    try {
        while (tokens.incrementToken()) {
            // Word boundaries likewise come from the filter's offset attribute.
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            Token t = at.addToken(offset.startOffset(), offset.endOffset());
            log.trace("detected {}", t);
    } catch (IOException e) {
        String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException) WordTokenFilter(org.apache.lucene.analysis.cn.smart.WordTokenFilter) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) SentenceTokenizer(org.apache.lucene.analysis.cn.smart.SentenceTokenizer) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)


CharSequenceReader (org.apache.commons.io.input.CharSequenceReader) Reader (java.io.Reader) StringReader (java.io.StringReader) Test (org.junit.Test)5 FileReader (java.io.FileReader) IOException (java.io.IOException) InputStreamReader (java.io.InputStreamReader) TokenStream (org.apache.lucene.analysis.TokenStream)4 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)3 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)3 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)3 NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText)3 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)3 File (java.io.File) ArrayList (java.util.ArrayList)2 SentenceTokenizer (org.apache.lucene.analysis.cn.smart.SentenceTokenizer)2 Token (org.apache.stanbol.enhancer.nlp.model.Token)2 NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag)2 Device (ai.elimu.model.Device)1 ApplicationOpenedEvent (