Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project textdb by TextDB.
The class WordCloudOpPartialExec, method calculateWordCount.
private static List<Tuple> calculateWordCount(List<String> texts, Analyzer luceneAnalyzer) throws Exception {
    HashMap<String, Integer> termFreqMap = new HashMap<>();
    for (String text : texts) {
        TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(text));
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // Use the token's character offsets to cut the original text out of the source string.
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String termStr = text.substring(charStart, charEnd).toLowerCase();
            // Skip English stop words; otherwise increment the term's frequency.
            if (!EnglishAnalyzer.ENGLISH_STOP_WORDS_SET.contains(termStr)) {
                termFreqMap.put(termStr, termFreqMap.get(termStr) == null ? 1 : termFreqMap.get(termStr) + 1);
            }
        }
        tokenStream.close();
    }
    // Emit one (term, frequency) tuple per distinct term.
    List<Tuple> termFreqTuples = new ArrayList<>();
    for (Map.Entry<String, Integer> e : termFreqMap.entrySet()) {
        termFreqTuples.add(Tuple.newBuilder(partialAggregateSchema)
                .addSequentially(new Object[] { e.getKey(), e.getValue() })
                .build());
    }
    return termFreqTuples;
}
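For comparison, the same offset-driven counting pattern can be written with only stock Lucene classes. The sketch below is an illustrative assumption, not textdb code: Tuple and partialAggregateSchema are textdb-specific, so this standalone variant simply prints the term frequencies.

import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class WordCountSketch {
    public static void main(String[] args) throws Exception {
        List<String> texts = List.of("Lucene offsets map each token back to the source text.");
        Map<String, Integer> termFreqMap = new HashMap<>();
        try (Analyzer analyzer = new StandardAnalyzer()) {
            for (String text : texts) {
                TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
                OffsetAttribute offsets = tokenStream.addAttribute(OffsetAttribute.class);
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    // Cut the original characters out of the text via the token's offsets.
                    String term = text.substring(offsets.startOffset(), offsets.endOffset()).toLowerCase();
                    termFreqMap.merge(term, 1, Integer::sum);
                }
                tokenStream.end();
                tokenStream.close();
            }
        }
        termFreqMap.forEach((term, count) -> System.out.println(term + " -> " + count));
    }
}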
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project HongsCORE by ihongs.
The class DemoTest, method main.
public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder()
            .withTokenizer("Name")
            .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
            .build();
    StringReader sr = new StringReader(args[0]);
    TokenStream ts = az.tokenStream("", sr);
    OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    try {
        // Resets this stream to the beginning. (Required)
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length() + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        // Perform end-of-stream operations, e.g. set the final offset.
        ts.end();
    } finally {
        // Release resources associated with this stream.
        ts.close();
    }
}
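The "Name" tokenizer registered by HongsCORE is project-specific. As a rough stock-Lucene equivalent (an assumption, not HongsCORE code), the builder call in main could be swapped for the "standard" tokenizer plus the same EdgeNGram filter; the rest of the loop and the term|length[start,end] output stay the same.

Analyzer az = CustomAnalyzer.builder()
        .withTokenizer("standard")
        .addTokenFilter("lowercase")
        .addTokenFilter("edgeNGram", "minGramSize", "1", "maxGramSize", "20")
        .build();
try (TokenStream ts = az.tokenStream("", new StringReader("Lucene"))) {
    OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // Each gram keeps the offsets of the token it was derived from.
        System.out.println(ta.toString() + "|" + ta.length() + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
    }
    ts.end();
}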
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
The class DefinitionsTokenStreamTest, method testDefinitionsVsContent.
// DefinitionsTokenStream should not be used in try-with-resources
@SuppressWarnings("java:S2095")
private void testDefinitionsVsContent(boolean expandTabs, String sourceResource, String tagsResource,
        int expectedCount, boolean doSupplement, Map<Integer, SimpleEntry<String, String>> overrides)
        throws IOException {
    StreamSource src = getSourceFromResource(sourceResource);
    // Deserialize the ctags.
    int tabSize = expandTabs ? 8 : 0;
    String suppResource = doSupplement ? sourceResource : null;
    Definitions defs = StreamUtils.readTagsFromResource(tagsResource, suppResource, tabSize);
    // Read the whole input.
    StringBuilder bld = new StringBuilder();
    String source;
    try (Reader rdr = ExpandTabsReader.wrap(IOUtils.createBOMStrippedReader(src.getStream(),
            StandardCharsets.UTF_8.name()), tabSize)) {
        int c;
        while ((c = rdr.read()) != -1) {
            bld.append((char) c);
        }
        source = bld.toString();
    }
    // Deserialize the token stream.
    DefinitionsTokenStream tokstream = new DefinitionsTokenStream();
    tokstream.initialize(defs, src, in -> ExpandTabsReader.wrap(in, tabSize));
    // Iterate through stream.
    CharTermAttribute term = tokstream.getAttribute(CharTermAttribute.class);
    assertNotNull(term, "CharTermAttribute");
    OffsetAttribute offs = tokstream.getAttribute(OffsetAttribute.class);
    assertNotNull(offs, "OffsetAttribute");
    int count = 0;
    while (tokstream.incrementToken()) {
        ++count;
        String termValue = term.toString();
        String cutValue = source.substring(offs.startOffset(), offs.endOffset());
        // If an override exists, test it specially.
        if (overrides != null && overrides.containsKey(count)) {
            SimpleEntry<String, String> overkv = overrides.get(count);
            assertEquals(overkv.getKey(), cutValue, "cut term override" + count);
            assertEquals(overkv.getValue(), termValue, "cut term w.r.t. term override" + count);
            continue;
        }
        boolean cutContainsTerm = cutValue.endsWith(termValue);
        assertTrue(cutContainsTerm, "cut term" + count + " at " + offs.startOffset() + "-" + offs.endOffset()
                + "[" + cutValue + "] vs [" + termValue + "]");
    }
    assertEquals(expectedCount, count, "token count");
}
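A hypothetical invocation of this helper might look as follows; the resource paths, expected count, and override values are illustrative only and do not refer to actual OpenGrok test data.

// Illustrative only: expect 10 tokens, and test token #5 against an explicit
// cut-text ("foo::bar") and term ("bar") pair instead of the endsWith check.
Map<Integer, SimpleEntry<String, String>> overrides = new TreeMap<>();
overrides.put(5, new SimpleEntry<>("foo::bar", "bar"));
testDefinitionsVsContent(true, "analysis/sample.c", "analysis/sampletags", 10, false, overrides);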
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
The class JFlexTokenizerTest, method testOffsetAttribute.
/**
 * Helper method for {@link #testOffsetAttribute()} that runs the test on
 * one single implementation class with the specified input text and
 * expected tokens.
 */
private void testOffsetAttribute(Class<? extends JFlexSymbolMatcher> klass, String inputText,
        String[] expectedTokens) throws Exception {
    JFlexSymbolMatcher matcher = klass.getConstructor(Reader.class).newInstance(new StringReader(inputText));
    JFlexTokenizer tokenizer = new JFlexTokenizer(matcher);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    while (tokenizer.incrementToken()) {
        assertTrue(count < expectedTokens.length, "too many tokens");
        String expected = expectedTokens[count];
        assertEquals(expected, term.toString(), "term");
        assertEquals(inputText.indexOf(expected), offset.startOffset(), "start");
        assertEquals(inputText.indexOf(expected) + expected.length(), offset.endOffset(), "end");
        count++;
    }
    assertEquals(expectedTokens.length, count, "wrong number of tokens");
}
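A call to this helper might look like the following; SampleSymbolMatcher is a hypothetical name standing in for one of OpenGrok's per-language JFlexSymbolMatcher implementations.

// Hypothetical: assumes the matcher emits the symbols "foo" and "bar" from the input.
testOffsetAttribute(SampleSymbolMatcher.class, "int foo = bar + 1;", new String[] {"foo", "bar"});

Note that the start/end assertions rely on inputText.indexOf(expected), so the check is only exact when each expected token's first occurrence in the input is the occurrence that was actually tokenized.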
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project OpenGrok by OpenGrok.
The class CustomAssertions, method assertSymbolStream.
/**
 * Asserts the specified tokenizer class produces an expected stream of
 * symbols from the specified input.
 * @param klass the test class
 * @param iss the input stream
 * @param expectedTokens the expected, ordered token list
 * @throws java.lang.Exception if an error occurs constructing a
 * {@code klass} instance or testing the stream
 */
public static void assertSymbolStream(Class<? extends JFlexSymbolMatcher> klass, InputStream iss,
        List<String> expectedTokens) throws Exception {
    byte[] inputCopy = copyStream(iss);
    String input = new String(inputCopy, StandardCharsets.UTF_8);
    JFlexTokenizer tokenizer = new JFlexTokenizer(klass.getConstructor(Reader.class).newInstance(
            new InputStreamReader(new ByteArrayInputStream(inputCopy), StandardCharsets.UTF_8)));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offs = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    List<String> tokens = new ArrayList<>();
    while (tokenizer.incrementToken()) {
        String termValue = term.toString();
        tokens.add(termValue);
        String cutValue = input.substring(offs.startOffset(), offs.endOffset());
        assertEquals(cutValue, termValue, "cut term" + (1 + count));
        ++count;
    }
    count = 0;
    for (String token : tokens) {
        // 1-based offset to accord with line #
        if (count >= expectedTokens.size()) {
            printTokens(tokens);
            assertTrue(count < expectedTokens.size(), "too many tokens at term" + (1 + count) + ": " + token);
        }
        String expected = expectedTokens.get(count);
        if (!token.equals(expected)) {
            printTokens(tokens);
            assertEquals(expected, token, "term" + (1 + count));
        }
        count++;
    }
    if (expectedTokens.size() != count) {
        printTokens(tokens);
        assertEquals(expectedTokens.size(), count, "wrong number of tokens");
    }
}
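Usage follows the same shape as the previous helper; the class and resource names below are hypothetical, not actual OpenGrok tests.

// Hypothetical: compare the symbols produced from a bundled resource with an expected list.
try (InputStream iss = SampleSymbolMatcherTest.class.getResourceAsStream("/analysis/sample.c")) {
    assertSymbolStream(SampleSymbolMatcher.class, iss, List.of("foo", "bar"));
}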