use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.
the class FstLinkingEngine method tag.
/**
* Uses the {@link Corpus} to tag the {@link AnalysedText} and adds the
* tagging results to the parsed tag map.
* @param content the content to link
* @param at the AnalysedText; not required in {@link LinkingModeEnum#PLAIN} mode
* @param session the tagging session of the text
* @param corpus the corpus of the session to tag the content with
* @param tags the Tags map used to store the tagging results
* @return the time in milliseconds spent in the tag callback.
* @throws IOException on any error while accessing the {@link SolrCore}
*/
private int tag(final String content, final AnalysedText at, final TaggingSession session, final Corpus corpus, final Map<int[], Tag> tags) throws IOException {
final OpenBitSet matchDocIdsBS = new OpenBitSet(session.getSearcher().maxDoc());
TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("", new CharSequenceReader(content));
final TokenStream tokenStream;
final TagClusterReducer reducer;
log.debug(" ... set up TokenStream and TagClusterReducer for linking mode {}", linkingMode);
switch(linkingMode) {
case PLAIN:
// will link all tokens and search longest dominant right
tokenStream = baseTokenStream;
reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
break;
case NER:
// this uses the NamedEntityTokenFilter as tokenStream and a
// combination with the longest dominant right as reducer
NamedEntityTokenFilter neTokenFilter = new NamedEntityTokenFilter(baseTokenStream, at, session.getLanguage(), neTypeMappings.keySet(), session.entityMentionTypes);
tokenStream = neTokenFilter;
reducer = new ChainedTagClusterReducer(neTokenFilter, TagClusterReducer.LONGEST_DOMINANT_RIGHT);
break;
case LINKABLE_TOKEN:
// this uses the LinkableTokenFilter as tokenStream
LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, at, session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()), elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
// NOTE that the LinkableTokenFilter implements longest dominant right
// based on the matchable span of tags (instead of the whole span).
reducer = new ChainedTagClusterReducer(linkableTokenFilter, TagClusterReducer.ALL);
tokenStream = linkableTokenFilter;
break;
default:
throw new IllegalStateException("Unrecognized LinkingMode '" + linkingMode + "! Please adapt implementation to changed Enumeration!");
}
log.debug(" - tokenStream: {}", tokenStream);
log.debug(" - reducer: {} (class: {})", reducer, reducer.getClass().getName());
// Now process the document
final long[] time = new long[] { 0 };
new Tagger(corpus.getFst(), tokenStream, reducer, session.isSkipAltTokens()) {
@Override
protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
long start = System.nanoTime();
if (log.isTraceEnabled()) {
log.trace(" > tagCallback for {}", content.subSequence(startOffset, endOffset));
}
int[] span = new int[] { startOffset, endOffset };
Tag tag = tags.get(span);
if (tag == null) {
tag = new Tag(span);
tags.put(span, tag);
}
// createMatches(..) builds the Match instances and also flags matchDocIdsBS (no caching, see NOTE below)
Set<Match> matches = createMatches(docIdsKey);
if (log.isTraceEnabled()) {
log.trace(" - {} matches", matches.size());
}
tag.addIds(matches);
long dif = System.nanoTime() - start;
time[0] = time[0] + dif;
}
// NOTE: We can not use a cache, because we need to create different
// Match instances even for the same 'docIdsKey'. This is because
// the same result list might get generated for different
// surface forms in the text (e.g. if the SolrIndex is case
// insensitive, but the linking does consider the case when
// calculating the score). If we would use this cache Match
// instances would be used for several occurrences in the text
// and Match#getScore() values would get overridden when
// processing those multiple occurrences.
// Map<Long,Set<Match>> docIdsListCache = new HashMap<Long,Set<Match>>(1024);
private Set<Match> createMatches(long docIdsKey) {
IntsRef docIds = lookupDocIds(docIdsKey);
Set<Match> matches = new HashSet<Match>(docIds.length);
for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
int docId = docIds.ints[i];
// also, flip docid in bitset
matchDocIdsBS.set(docId);
// translates here
matches.add(session.createMatch(docId));
}
return matches;
}
}.process();
return (int) (time[0] / 1000000);
}
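Note that the tags map handed to this method is keyed by int[] spans, so the caller has to supply a Map implementation that compares those arrays by content; a plain HashMap would never find a previously stored span because int[] uses identity-based equals and hashCode. Below is a minimal sketch of such a map, assuming a simple start/end ordering (the comparator actually used by FstLinkingEngine may differ); it reuses the Tag type from the snippet above.

import java.util.NavigableMap;
import java.util.TreeMap;

// Sketch only: orders int[] {start, end} spans by start offset, then by end offset.
NavigableMap<int[], Tag> spanTags = new TreeMap<>((a, b) -> {
    int c = Integer.compare(a[0], b[0]);
    return c != 0 ? c : Integer.compare(a[1], b[1]);
});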
use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.
the class SmartcnTokenizerEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, false);
if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
}
if (!at.getSentences().hasNext()) {
// no sentences ... use this engine to detect
// first the sentences
TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
try {
// the TokenStream contract requires reset() before the first incrementToken() call
sentences.reset();
while (sentences.incrementToken()) {
OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
if (log.isTraceEnabled()) {
log.trace("detected {}:{}", s, s.getSpan());
}
}
} catch (IOException e) {
String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
log.error(message, e);
throw new EngineException(this, ci, message, e);
}
}
// now the tokens
TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
try {
tokens.reset();
while (tokens.incrementToken()) {
OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
Token t = at.addToken(offset.startOffset(), offset.endOffset());
log.trace("detected {}", t);
}
} catch (IOException e) {
String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
log.error(message, e);
throw new EngineException(this, ci, message, e);
}
}
use of org.apache.commons.io.input.CharSequenceReader in project thingsboard by thingsboard.
the class CsvUtils method parseCsv.
public static List<List<String>> parseCsv(String content, Character delimiter) throws Exception {
CSVFormat csvFormat = delimiter.equals(',') ? CSVFormat.DEFAULT : CSVFormat.DEFAULT.withDelimiter(delimiter);
List<CSVRecord> records;
try (CharSequenceReader reader = new CharSequenceReader(content)) {
records = csvFormat.parse(reader).getRecords();
}
return records.stream()
        .map(record -> Stream.iterate(0, i -> i < record.size(), i -> i + 1)
                .map(record::get)
                .collect(Collectors.toList()))
        .collect(Collectors.toList());
}
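A hypothetical call to parseCsv, showing that the result is a list of rows, each row being the list of its cell values (the semicolon-delimited input string is made up for illustration):

// Any delimiter other than ',' switches CSVFormat.DEFAULT over to that delimiter.
List<List<String>> rows = CsvUtils.parseCsv("name;unit\ntemperature;celsius", ';');
// rows -> [[name, unit], [temperature, celsius]]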
use of org.apache.commons.io.input.CharSequenceReader in project webapp by elimu-ai.
the class ApplicationOpenedEventRestController method create.
@RequestMapping("/create")
public String create(HttpServletRequest request, // TODO: checksum,
@RequestParam MultipartFile multipartFile) {
logger.info("create");
logger.info("request.getQueryString(): " + request.getQueryString());
if (!multipartFile.isEmpty()) {
try {
byte[] bytes = multipartFile.getBytes();
Reader reader = new CharSequenceReader(new String(bytes));
List<String> lines = IOUtils.readLines(reader);
logger.info("lines.size(): " + lines.size());
reader.close();
for (String eventLine : lines) {
logger.info("eventLine: " + eventLine);
// Expected format: id:163|deviceId:2312aff4939750ea|time:1496843219926|packageName:ai.elimu.nyaqd|studentId:2312aff4939750ea_4
String deviceId = EventLineHelper.getDeviceId(eventLine);
Device device = deviceDao.read(deviceId);
logger.info("device: " + device);
Calendar timeOfEvent = EventLineHelper.getTime(eventLine);
String packageName = EventLineHelper.getPackageName(eventLine);
ApplicationOpenedEvent existingApplicationOpenedEvent = applicationOpenedEventDao.read(device, timeOfEvent, packageName);
logger.info("existingApplicationOpenedEvent: " + existingApplicationOpenedEvent);
if (existingApplicationOpenedEvent == null) {
ApplicationOpenedEvent applicationOpenedEvent = new ApplicationOpenedEvent();
applicationOpenedEvent.setDevice(device);
applicationOpenedEvent.setCalendar(timeOfEvent);
applicationOpenedEvent.setPackageName(packageName);
applicationOpenedEventDao.create(applicationOpenedEvent);
}
}
} catch (IOException ex) {
logger.error(null, ex);
}
}
JSONObject jsonObject = new JSONObject();
jsonObject.put("result", "success");
// TODO: handle error
logger.info("jsonObject: " + jsonObject);
return jsonObject.toString();
}
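The EventLineHelper used above is not shown in this snippet. The following is a hypothetical sketch of how the pipe-delimited event line format documented in the comment above could be parsed; the class and method bodies are assumptions, and only the field names come from the example line.

import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;

// Hypothetical parser for lines like
// id:163|deviceId:2312aff4939750ea|time:1496843219926|packageName:ai.elimu.nyaqd|studentId:2312aff4939750ea_4
public class EventLineParserSketch {

    private static Map<String, String> toMap(String eventLine) {
        Map<String, String> fields = new HashMap<>();
        for (String part : eventLine.split("\\|")) {
            int idx = part.indexOf(':');
            if (idx > 0) {
                fields.put(part.substring(0, idx), part.substring(idx + 1));
            }
        }
        return fields;
    }

    public static String getDeviceId(String eventLine) {
        return toMap(eventLine).get("deviceId");
    }

    public static String getPackageName(String eventLine) {
        return toMap(eventLine).get("packageName");
    }

    public static Calendar getTime(String eventLine) {
        Calendar calendar = Calendar.getInstance();
        // the time field is assumed to be epoch milliseconds, as in the example line
        calendar.setTimeInMillis(Long.parseLong(toMap(eventLine).get("time")));
        return calendar;
    }
}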
use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.
the class SmartcnSentenceEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, false);
if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
}
// first the sentences
TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
try {
sentences.reset();
while (sentences.incrementToken()) {
OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
if (log.isTraceEnabled()) {
log.trace("detected {}:{}", s, s.getSpan());
}
}
} catch (IOException e) {
String message = String.format("IOException while reading from " + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
log.error(message, e);
throw new EngineException(this, ci, message, e);
}
}
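Both Smartcn engines above follow the standard Lucene TokenStream consumer contract when reading from a CharSequenceReader: reset the stream, iterate incrementToken(), then end() and close() it. The following is a minimal, self-contained sketch of that pattern, using Lucene's WhitespaceAnalyzer purely for illustration (its package location varies across Lucene versions); the engines above use Smartcn's SentenceTokenizer and WordTokenFilter instead.

import java.io.IOException;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamLifecycleSketch {
    public static void main(String[] args) throws IOException {
        try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("", new CharSequenceReader("hello token stream"));
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            try {
                ts.reset(); // required before the first incrementToken()
                while (ts.incrementToken()) {
                    System.out.println(offset.startOffset() + "-" + offset.endOffset());
                }
                ts.end(); // records the final offset state
            } finally {
                ts.close();
            }
        }
    }
}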