Example 6 with CharSequenceReader

use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.

the class FstLinkingEngine method tag.

/**
 * Uses the {@link Corpus} to tag the {@link AnalysedText} and adds
 * tagging results to the parsed tag map.
 * @param content the content to link
 * @param at the AnalyzedText. not required if {@link LinkingModeEnum#PLAIN}
 * @param session the tagging session of the text
 * @param corpus the corpus of the session to tag the content with
 * @param tags the Tags map used to store the tagging results
 * @return the time in milliseconds spent in the tag callback.
 * @throws IOException on any error while accessing the {@link SolrCore}
 */
private int tag(final String content, final AnalysedText at, final TaggingSession session,
        final Corpus corpus, final Map<int[], Tag> tags) throws IOException {
    final OpenBitSet matchDocIdsBS = new OpenBitSet(session.getSearcher().maxDoc());
    TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("", new CharSequenceReader(content));
    final TokenStream tokenStream;
    final TagClusterReducer reducer;
    log.debug(" ... set up TokenStream and TagClusterReducer for linking mode {}", linkingMode);
    switch(linkingMode) {
        case PLAIN:
            // will link all tokens and search longest dominant right
            tokenStream = baseTokenStream;
            reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
            break;
        case NER:
            // this uses the NamedEntityTokenFilter as tokenStream and combines
            // it with the longest dominant right reducer
            NamedEntityTokenFilter neTokenFilter = new NamedEntityTokenFilter(baseTokenStream, at, session.getLanguage(), neTypeMappings.keySet(), session.entityMentionTypes);
            tokenStream = neTokenFilter;
            reducer = new ChainedTagClusterReducer(neTokenFilter, TagClusterReducer.LONGEST_DOMINANT_RIGHT);
            break;
        case LINKABLE_TOKEN:
            // this uses the LinkableTokenFilter as tokenStream
            LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()), elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
            // NOTE that the LinkableTokenFilter implements longest dominant right
            // based on the matchable span of tags (instead of the whole span).
            reducer = new ChainedTagClusterReducer(linkableTokenFilter, TagClusterReducer.ALL);
            tokenStream = linkableTokenFilter;
            break;
        default:
            throw new IllegalStateException("Unrecognized LinkingMode '" + linkingMode + "! Please adapt implementation to changed Enumeration!");
    }
    log.debug(" - tokenStream: {}", tokenStream);
    log.debug(" - reducer: {} (class: {})", reducer, reducer.getClass().getName());
    // Now process the document
    final long[] time = new long[] { 0 };
    new Tagger(corpus.getFst(), tokenStream, reducer, session.isSkipAltTokens()) {

        @Override
        protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
            long start = System.nanoTime();
            if (log.isTraceEnabled()) {
                log.trace(" > tagCallback for {}", content.subSequence(startOffset, endOffset));
            }
            int[] span = new int[] { startOffset, endOffset };
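            // NOTE: a lookup keyed on int[] only works if the parsed tags map
            // compares spans by value (e.g. a TreeMap with a span comparator),
            // since plain int[] keys use identity equals/hashCode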
            Tag tag = tags.get(span);
            if (tag == null) {
                tag = new Tag(span);
                tags.put(span, tag);
            }
            // below caches, and also flags matchDocIdsBS
            Set<Match> matches = createMatches(docIdsKey);
            if (log.isTraceEnabled()) {
                log.trace("  - {} matches", matches.size());
            }
            tag.addIds(matches);
            long dif = System.nanoTime() - start;
            time[0] = time[0] + dif;
        }

        // NOTE: We cannot use a cache, because we need to create different
        // Match instances even for the same 'docIdsKey'. This is because
        // the same result list might get generated for different
        // surface forms in the text (e.g. if the SolrIndex is case
        // insensitive, but the linking does consider the case when
        // calculating the score). If we used such a cache, the same Match
        // instances would be reused for several occurrences in the text
        // and Match#getScore() values would get overridden when
        // processing those multiple occurrences.
        // Map<Long,Set<Match>> docIdsListCache = new HashMap<Long,Set<Match>>(1024);
        private Set<Match> createMatches(long docIdsKey) {
            IntsRef docIds = lookupDocIds(docIdsKey);
            Set<Match> matches = new HashSet<Match>(docIds.length);
            for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
                int docId = docIds.ints[i];
                // also, flip docid in bitset
                matchDocIdsBS.set(docId);
                // translates here
                matches.add(session.createMatch(docId));
            }
            return matches;
        }
    }.process();
    return (int) (time[0] / 1000000);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) OpenBitSet(org.apache.lucene.util.OpenBitSet) Set(java.util.Set) HashSet(java.util.HashSet) Tagger(org.opensextant.solrtexttagger.Tagger) TagClusterReducer(org.opensextant.solrtexttagger.TagClusterReducer) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IntsRef(org.apache.lucene.util.IntsRef)
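
The core pattern of this example — wrapping the content in a CharSequenceReader and handing it to a Lucene Analyzer via tokenStream(..) — also works outside Stanbol. Below is a minimal, self-contained sketch of that pattern; it substitutes StandardAnalyzer for the corpus-specific tagging analyzer and assumes a Lucene version with the no-arg StandardAnalyzer constructor, so treat it as illustrative rather than a drop-in for the engine code above.

import java.io.IOException;

import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class CharSequenceReaderTokenizeSketch {

    public static void main(String[] args) throws IOException {
        CharSequence content = "Paris is the capital of France.";
        try (Analyzer analyzer = new StandardAnalyzer()) {
            // tokenStream(..) accepts any Reader; CharSequenceReader avoids
            // copying the CharSequence into a String first
            TokenStream stream = analyzer.tokenStream("", new CharSequenceReader(content));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset(); // required before the first incrementToken() call
            while (stream.incrementToken()) {
                System.out.printf("%s [%d,%d]%n", term, offset.startOffset(), offset.endOffset());
            }
            stream.end();
            stream.close();
        }
    }
}

The reset()/incrementToken()/end()/close() sequence is the standard TokenStream consumption contract; in the engine above, the Tagger's process() call drives that consumption internally.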

Example 7 with CharSequenceReader

use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.

the class SmartcnTokenizerEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    if (!at.getSentences().hasNext()) {
        // no sentences ... use this engine to first detect the sentences
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            // the TokenStream contract requires reset() before the first
            // incrementToken() call (as SmartcnSentenceEngine below does)
            sentences.reset();
            while (sentences.incrementToken()) {
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
                }
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
            log.error(message, e);
            throw new EngineException(this, ci, message, e);
        }
    }
    // now the tokens
    TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
    try {
        tokens.reset();
        while (tokens.incrementToken()) {
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            Token t = at.addToken(offset.startOffset(), offset.endOffset());
            log.trace("detected {}", t);
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException) WordTokenFilter(org.apache.lucene.analysis.cn.smart.WordTokenFilter) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) SentenceTokenizer(org.apache.lucene.analysis.cn.smart.SentenceTokenizer) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
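
The SentenceTokenizer / WordTokenFilter chain this engine assembles by hand is also bundled as SmartChineseAnalyzer in the same lucene-analyzers-smartcn module. A hedged sketch of driving it through a CharSequenceReader (it assumes a Lucene version where SmartChineseAnalyzer has a no-arg constructor; older releases required a Version argument):

import java.io.IOException;

import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class SmartcnSketch {

    public static void main(String[] args) throws IOException {
        try (SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {
            TokenStream tokens = analyzer.tokenStream("",
                    new CharSequenceReader("我爱北京天安门。这是一个测试。"));
            CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                // prints each segmented word with its character offsets
                System.out.printf("%s [%d,%d]%n", term, offset.startOffset(), offset.endOffset());
            }
            tokens.end();
            tokens.close();
        }
    }
}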

Example 8 with CharSequenceReader

use of org.apache.commons.io.input.CharSequenceReader in project thingsboard by thingsboard.

the class CsvUtils method parseCsv.

public static List<List<String>> parseCsv(String content, Character delimiter) throws Exception {
    CSVFormat csvFormat = delimiter.equals(',') ? CSVFormat.DEFAULT : CSVFormat.DEFAULT.withDelimiter(delimiter);
    List<CSVRecord> records;
    try (CharSequenceReader reader = new CharSequenceReader(content)) {
        records = csvFormat.parse(reader).getRecords();
    }
    return records.stream()
            .map(record -> Stream.iterate(0, i -> i < record.size(), i -> i + 1)
                    .map(record::get)
                    .collect(Collectors.toList()))
            .collect(Collectors.toList());
}
Also used : List(java.util.List) AccessLevel(lombok.AccessLevel) Stream(java.util.stream.Stream) CSVFormat(org.apache.commons.csv.CSVFormat) CSVRecord(org.apache.commons.csv.CSVRecord) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) Collectors(java.util.stream.Collectors) NoArgsConstructor(lombok.NoArgsConstructor)
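
A quick usage sketch for parseCsv (assuming the CsvUtils class above is on the classpath; the input strings are illustrative and the outputs in the comments follow from CSVFormat.DEFAULT semantics):

import java.util.List;

public class CsvUtilsDemo {

    public static void main(String[] args) throws Exception {
        // a comma keeps CSVFormat.DEFAULT unchanged
        List<List<String>> rows = CsvUtils.parseCsv("name,age\nalice,30\nbob,25", ',');
        System.out.println(rows); // [[name, age], [alice, 30], [bob, 25]]

        // any other delimiter is applied via CSVFormat.withDelimiter(..)
        System.out.println(CsvUtils.parseCsv("a;b;c", ';')); // [[a, b, c]]
    }
}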

Example 9 with CharSequenceReader

use of org.apache.commons.io.input.CharSequenceReader in project webapp by elimu-ai.

the class ApplicationOpenedEventRestController method create.

@RequestMapping("/create")
public String create(
        HttpServletRequest request,
        // TODO: checksum
        @RequestParam MultipartFile multipartFile) {
    logger.info("create");
    logger.info("request.getQueryString(): " + request.getQueryString());
    if (!multipartFile.isEmpty()) {
        try {
            byte[] bytes = multipartFile.getBytes();
            Reader reader = new CharSequenceReader(new String(bytes));
            List<String> lines = IOUtils.readLines(reader);
            logger.info("lines.size(): " + lines.size());
            reader.close();
            for (String eventLine : lines) {
                logger.info("eventLine: " + eventLine);
                // Expected format: id:163|deviceId:2312aff4939750ea|time:1496843219926|packageName:ai.elimu.nyaqd|studentId:2312aff4939750ea_4
                String deviceId = EventLineHelper.getDeviceId(eventLine);
                Device device = deviceDao.read(deviceId);
                logger.info("device: " + device);
                Calendar timeOfEvent = EventLineHelper.getTime(eventLine);
                String packageName = EventLineHelper.getPackageName(eventLine);
                ApplicationOpenedEvent existingApplicationOpenedEvent = applicationOpenedEventDao.read(device, timeOfEvent, packageName);
                logger.info("existingApplicationOpenedEvent: " + existingApplicationOpenedEvent);
                if (existingApplicationOpenedEvent == null) {
                    ApplicationOpenedEvent applicationOpenedEvent = new ApplicationOpenedEvent();
                    applicationOpenedEvent.setDevice(device);
                    applicationOpenedEvent.setCalendar(timeOfEvent);
                    applicationOpenedEvent.setPackageName(packageName);
                    applicationOpenedEventDao.create(applicationOpenedEvent);
                }
            }
        } catch (IOException ex) {
            logger.error(null, ex);
        }
    }
    JSONObject jsonObject = new JSONObject();
    jsonObject.put("result", "success");
    // TODO: handle error
    logger.info("jsonObject: " + jsonObject);
    return jsonObject.toString();
}
Also used : CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) JSONObject(org.json.JSONObject) Device(ai.elimu.model.Device) Calendar(java.util.Calendar) ApplicationOpenedEvent(ai.elimu.model.analytics.ApplicationOpenedEvent) Reader(java.io.Reader) IOException(java.io.IOException) RequestMapping(org.springframework.web.bind.annotation.RequestMapping)
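
The bytes-to-lines idiom from this controller, reduced to a self-contained sketch. One caveat worth flagging: the controller decodes the upload via new String(bytes), i.e. with the platform default charset; the sketch passes an explicit charset instead and uses try-with-resources so the reader is closed even if readLines(..) throws:

import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CharSequenceReader;

public class ReadLinesSketch {

    public static void main(String[] args) throws Exception {
        byte[] bytes = "line one\nline two\nline three".getBytes(StandardCharsets.UTF_8);
        try (Reader reader = new CharSequenceReader(new String(bytes, StandardCharsets.UTF_8))) {
            List<String> lines = IOUtils.readLines(reader);
            lines.forEach(System.out::println); // line one / line two / line three
        }
    }
}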

Example 10 with CharSequenceReader

use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.

the class SmartcnSentenceEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
 * stores it as a new part in the content item. The metadata is not changed.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
        throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    // first the sentences
    TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
    try {
        sentences.reset();
        while (sentences.incrementToken()) {
            OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
            Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
            if (log.isTraceEnabled()) {
                log.trace("detected {}:{}", s, s.getSpan());
            }
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) TokenStream(org.apache.lucene.analysis.TokenStream) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) SentenceTokenizer(org.apache.lucene.analysis.cn.smart.SentenceTokenizer) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence)
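
A closing observation on why CharSequenceReader keeps appearing in these engines: its constructor accepts any CharSequence, so text held in a StringBuilder, a CharBuffer, or a custom CharSequence (as the result of at.getText() appears to be) can be streamed into a Reader-based API without first being copied into a String. A minimal sketch of that property:

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;

import org.apache.commons.io.input.CharSequenceReader;

public class AnyCharSequenceSketch {

    public static void main(String[] args) throws IOException {
        // any CharSequence implementation works: String, StringBuilder, CharBuffer, ...
        StringBuilder builder = new StringBuilder("text assembled ").append("incrementally");
        CharBuffer buffer = CharBuffer.wrap("a wrapped char array".toCharArray());

        for (CharSequence source : new CharSequence[] { builder, buffer }) {
            try (Reader reader = new CharSequenceReader(source)) {
                char[] chunk = new char[16];
                int n;
                while ((n = reader.read(chunk)) != -1) {
                    System.out.print(new String(chunk, 0, n));
                }
                System.out.println();
            }
        }
    }
}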

Aggregations

CharSequenceReader (org.apache.commons.io.input.CharSequenceReader)14 Test (org.junit.Test)7 Reader (java.io.Reader)6 StringReader (java.io.StringReader)5 FileReader (java.io.FileReader)4 IOException (java.io.IOException)4 InputStreamReader (java.io.InputStreamReader)4 TokenStream (org.apache.lucene.analysis.TokenStream)4 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)3 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)3 Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence)3 NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText)3 File (java.io.File)2 StringWriter (java.io.StringWriter)2 CharBuffer (java.nio.CharBuffer)2 ArrayList (java.util.ArrayList)2 Collectors.joining (java.util.stream.Collectors.joining)2 IntStream (java.util.stream.IntStream)2 IOUtils (org.apache.commons.io.IOUtils)2 SentenceTokenizer (org.apache.lucene.analysis.cn.smart.SentenceTokenizer)2