use of org.apache.commons.io.input.CharSequenceReader in project asterixdb by apache.
the class FunctionParser method getFunctionDecl.
public FunctionDecl getFunctionDecl(Function function) throws CompilationException {
    String functionBody = function.getFunctionBody();
    List<String> params = function.getParams();
    List<VarIdentifier> varIdentifiers = new ArrayList<VarIdentifier>();
    StringBuilder builder = new StringBuilder();
    builder.append(" use dataverse " + function.getDataverseName() + ";");
    builder.append(" declare function " + function.getName().split("@")[0]);
    builder.append("(");
    boolean first = true;
    for (String param : params) {
        VarIdentifier varId = new VarIdentifier(param);
        varIdentifiers.add(varId);
        if (first) {
            first = false;
        } else {
            builder.append(",");
        }
        builder.append(param);
    }
    builder.append("){\n").append(functionBody).append("\n}");
    IParser parser = parserFactory.createParser(new CharSequenceReader(builder));
    List<Statement> statements = parser.parse();
    FunctionDecl decl = (FunctionDecl) statements.get(1);
    return decl;
}
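Because CharSequenceReader accepts any CharSequence, the assembled StringBuilder is handed to the parser without an intermediate builder.toString() copy. A minimal, self-contained sketch of that pattern; IOUtils.toString only stands in for the parser here and the class name is made up for illustration:
import java.io.Reader;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CharSequenceReader;

public class CharSequenceReaderSketch {
    public static void main(String[] args) throws Exception {
        StringBuilder builder = new StringBuilder();
        builder.append(" use dataverse test;");
        builder.append(" declare function f(x){\n x + 1 \n}");
        // the StringBuilder is read directly; no String copy is created
        try (Reader reader = new CharSequenceReader(builder)) {
            System.out.println(IOUtils.toString(reader));
        }
    }
}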
use of org.apache.commons.io.input.CharSequenceReader in project tutorials by eugenp.
the class JavaXToReaderUnitTest method givenUsingCommonsIO_whenConvertingFileIntoReader_thenCorrect.
@Test
public void givenUsingCommonsIO_whenConvertingFileIntoReader_thenCorrect() throws IOException {
    final File initialFile = new File("src/test/resources/initialFile.txt");
    FileUtils.touch(initialFile);
    FileUtils.write(initialFile, "With Commons IO");
    final byte[] buffer = FileUtils.readFileToByteArray(initialFile);
    final Reader targetReader = new CharSequenceReader(new String(buffer));
    targetReader.close();
}
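The byte[] round trip is only needed because CharSequenceReader expects a CharSequence; reading the file straight into a String is shorter. A hedged variant of the test (the method name is invented, and the readFileToString overload without a charset is deprecated in newer Commons IO releases):
@Test
public void givenUsingCommonsIO_whenConvertingFileIntoReader_thenReaderIsBuiltFromString() throws IOException {
    final File initialFile = new File("src/test/resources/initialFile.txt");
    FileUtils.touch(initialFile);
    FileUtils.write(initialFile, "With Commons IO");
    // readFileToString avoids the explicit byte[] buffer used above
    final String content = FileUtils.readFileToString(initialFile);
    final Reader targetReader = new CharSequenceReader(content);
    targetReader.close();
}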
use of org.apache.commons.io.input.CharSequenceReader in project tutorials by eugenp.
the class JavaXToReaderUnitTest method givenUsingCommonsIO_whenConvertingByteArrayIntoReader_thenCorrect.
@Test
public void givenUsingCommonsIO_whenConvertingByteArrayIntoReader_thenCorrect() throws IOException {
    final byte[] initialArray = "With Commons IO".getBytes();
    final Reader targetReader = new CharSequenceReader(new String(initialArray));
    targetReader.close();
}
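Both tests only construct and close the Reader. To actually verify the conversion, the Reader can be drained and compared with the source text; a sketch assuming JUnit's assertEquals and Commons IO's IOUtils are imported (the test name is invented):
@Test
public void givenCharSequenceReader_whenReadBack_thenContentsMatch() throws IOException {
    final byte[] initialArray = "With Commons IO".getBytes();
    try (Reader targetReader = new CharSequenceReader(new String(initialArray))) {
        // IOUtils.toString drains the Reader so the round trip can be asserted
        assertEquals("With Commons IO", IOUtils.toString(targetReader));
    }
}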
use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.
the class FstLinkingEngine method tag.
/**
* Uses the {@link Corpus} to tag the {@link AnalysedText} and adds
* tagging results to the parsed tag map.
* @param content the content to link
* @param at the AnalysedText; not required in {@link LinkingModeEnum#PLAIN} mode
* @param session the tagging session of the text
* @param corpus the corpus of the session to tag the content with
* @param tags the Tags map used to store the tagging results
* @return the time in milliseconds spent in the tag callback.
* @throws IOException on any error while accessing the {@link SolrCore}
*/
private int tag(final String content, final AnalysedText at, final TaggingSession session,
        final Corpus corpus, final Map<int[], Tag> tags) throws IOException {
    final OpenBitSet matchDocIdsBS = new OpenBitSet(session.getSearcher().maxDoc());
    TokenStream baseTokenStream = corpus.getTaggingAnalyzer().tokenStream("", new CharSequenceReader(content));
    final TokenStream tokenStream;
    final TagClusterReducer reducer;
    log.debug(" ... set up TokenStream and TagClusterReducer for linking mode {}", linkingMode);
    switch (linkingMode) {
        case PLAIN:
            // will link all tokens and search longest dominant right
            tokenStream = baseTokenStream;
            reducer = TagClusterReducer.LONGEST_DOMINANT_RIGHT;
            break;
        case NER:
            // this uses the NamedEntityTokenFilter as tokenStream and a
            // combination with the longest dominant right as reducer
            NamedEntityTokenFilter neTokenFilter = new NamedEntityTokenFilter(baseTokenStream, at,
                    session.getLanguage(), neTypeMappings.keySet(), session.entityMentionTypes);
            tokenStream = neTokenFilter;
            reducer = new ChainedTagClusterReducer(neTokenFilter, TagClusterReducer.LONGEST_DOMINANT_RIGHT);
            break;
        case LINKABLE_TOKEN:
            // this uses the LinkableTokenFilter as tokenStream
            LinkableTokenFilter linkableTokenFilter = new LinkableTokenFilter(baseTokenStream, at,
                    session.getLanguage(), tpConfig.getConfiguration(session.getLanguage()),
                    elConfig.getMinChunkMatchScore(), elConfig.getMinFoundTokens());
            // NOTE that the LinkableTokenFilter implements longest dominant right
            // based on the matchable span of tags (instead of the whole span).
            reducer = new ChainedTagClusterReducer(linkableTokenFilter, TagClusterReducer.ALL);
            tokenStream = linkableTokenFilter;
            break;
        default:
            throw new IllegalStateException("Unrecognized LinkingMode '" + linkingMode
                    + "'! Please adapt the implementation to the changed enumeration!");
    }
    log.debug(" - tokenStream: {}", tokenStream);
    log.debug(" - reducer: {} (class: {})", reducer, reducer.getClass().getName());
    // Now process the document
    final long[] time = new long[] { 0 };
    new Tagger(corpus.getFst(), tokenStream, reducer, session.isSkipAltTokens()) {

        @Override
        protected void tagCallback(int startOffset, int endOffset, long docIdsKey) {
            long start = System.nanoTime();
            if (log.isTraceEnabled()) {
                log.trace(" > tagCallback for {}", content.subSequence(startOffset, endOffset));
            }
            int[] span = new int[] { startOffset, endOffset };
            Tag tag = tags.get(span);
            if (tag == null) {
                tag = new Tag(span);
                tags.put(span, tag);
            }
            // createMatches builds the Match set and also flags matchDocIdsBS
            Set<Match> matches = createMatches(docIdsKey);
            if (log.isTraceEnabled()) {
                log.trace(" - {} matches", matches.size());
            }
            tag.addIds(matches);
            long dif = System.nanoTime() - start;
            time[0] = time[0] + dif;
        }

        // NOTE: We can not use a cache, because we need to create different
        // Match instances even for the same 'docIdsKey'. This is because
        // the same result list might get generated for different
        // surface forms in the text (e.g. if the SolrIndex is case
        // insensitive, but the linking does consider the case when
        // calculating the score). If we would use this cache, Match
        // instances would be used for several occurrences in the text
        // and Match#getScore() values would get overridden when
        // processing those multiple occurrences.
        // Map<Long,Set<Match>> docIdsListCache = new HashMap<Long,Set<Match>>(1024);
        private Set<Match> createMatches(long docIdsKey) {
            IntsRef docIds = lookupDocIds(docIdsKey);
            Set<Match> matches = new HashSet<Match>(docIds.length);
            for (int i = docIds.offset; i < docIds.offset + docIds.length; i++) {
                int docId = docIds.ints[i];
                // also flip the docId in the bitset
                matchDocIdsBS.set(docId);
                // translate the Solr docId into a Match for this session
                matches.add(session.createMatch(docId));
            }
            return matches;
        }
    }.process();
    return (int) (time[0] / 1000000);
}
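The only Commons IO specific piece above is wrapping the content CharSequence in a CharSequenceReader before handing it to the corpus analyzer. A minimal sketch of that general Lucene pattern, with StandardAnalyzer standing in for corpus.getTaggingAnalyzer(); the analyzer choice, class name, and empty field name are assumptions, not part of the Stanbol code:
import java.io.IOException;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TaggingAnalyzerSketch {
    public static void main(String[] args) throws IOException {
        CharSequence content = "Paris is the capital of France";
        Analyzer analyzer = new StandardAnalyzer();
        // the CharSequence is streamed to the analyzer without copying it into a String
        try (TokenStream tokenStream = analyzer.tokenStream("", new CharSequenceReader(content))) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.println(term.toString());
            }
            tokenStream.end();
        }
    }
}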
use of org.apache.commons.io.input.CharSequenceReader in project stanbol by apache.
the class SmartcnTokenizerEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("zh".equals(language) || (language != null && language.startsWith("zh-")))) {
throw new IllegalStateException("The detected language is NOT 'zh'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    if (!at.getSentences().hasNext()) {
        // no sentences ... use this engine to detect
        // first the sentences
        TokenStream sentences = new SentenceTokenizer(new CharSequenceReader(at.getText()));
        try {
            while (sentences.incrementToken()) {
                OffsetAttribute offset = sentences.addAttribute(OffsetAttribute.class);
                Sentence s = at.addSentence(offset.startOffset(), offset.endOffset());
                if (log.isTraceEnabled()) {
                    log.trace("detected {}:{}", s, s.getSpan());
                }
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from "
                    + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
            log.error(message, e);
            throw new EngineException(this, ci, message, e);
        }
    }
    // now the tokens
    TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
    try {
        tokens.reset();
        while (tokens.incrementToken()) {
            OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
            Token t = at.addToken(offset.startOffset(), offset.endOffset());
            log.trace("detected {}", t);
        }
    } catch (IOException e) {
        String message = String.format("IOException while reading from "
                + "CharSequenceReader of AnalyzedText for ContentItem %s", ci.getUri());
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }
}
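Both loops use the OffsetAttribute to map detected spans back into the original CharSequence wrapped by the CharSequenceReader. A minimal sketch of that offset pattern, using WhitespaceAnalyzer purely for illustration (the Smartcn SentenceTokenizer/WordTokenFilter chain from the engine is not reproduced here, and the class name is invented):
import java.io.IOException;
import org.apache.commons.io.input.CharSequenceReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class OffsetSketch {
    public static void main(String[] args) throws IOException {
        CharSequence text = "offsets map tokens back to the text";
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("", new CharSequenceReader(text))) {
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // start/end offsets index into the original CharSequence
                System.out.println(text.subSequence(offset.startOffset(), offset.endOffset()));
            }
            stream.end();
        }
    }
}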