Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.
The class SuggestionTester, method analyze.
/**
 * Runs the given tokenizer through a {@link SuggestionFilter} and writes one
 * report line per token to {@code writer}: the original word, the filtered
 * term text, and the token's flags (as rendered by {@code Constants.toString}).
 *
 * @param reader          source text; attached to {@code t} before filtering
 * @param writer          destination for the per-token report lines
 * @param voikko          Voikko instance used by the suggestion filter
 * @param suggestionFile  suggestion-rule file passed to SuggestionFilter
 * @param stopOnSuccess   NOTE(review): currently unused in this method — verify against callers
 * @param useHyphenFilter NOTE(review): currently unused in this method — verify against callers
 * @param t               token source; must actually be a {@link Tokenizer}
 * @throws IOException if tokenization or writing fails
 */
public static void analyze(Reader reader, Writer writer, Voikko voikko, String suggestionFile, boolean stopOnSuccess, boolean useHyphenFilter, TokenStream t) throws IOException {
    ((Tokenizer) t).setReader(reader);
    t = new SuggestionFilter(t, voikko, suggestionFile, false);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    FlagsAttribute flagsAtt = t.addAttribute(FlagsAttribute.class);
    OriginalWordAttribute originalWordAtt = t.addAttribute(OriginalWordAttribute.class);
    try {
        t.reset();
        while (t.incrementToken()) {
            writer.write("Sana: " + originalWordAtt.getOriginalWord() + " | " + termAtt.toString() + " | ");
            writer.write(Constants.toString(flagsAtt));
            writer.write("\n");
            writer.flush();
        }
        t.end();
    } finally {
        // Always release the stream, even if incrementToken() threw.
        t.close();
    }
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.
The class BaseFormTester, method test.
/**
 * Tokenizes {@code reader} through a {@link BaseFormFilter} and writes, for
 * each distinct original word, one line listing its accumulated base forms.
 * Consecutive tokens sharing the same original word are merged into a single
 * entry; the base forms are collected into a sorted set.
 *
 * @param reader      source text to analyze
 * @param writer      destination for the per-word report lines
 * @param voikko      Voikko instance used by the base-form filter
 * @param successOnly passed through to {@link BaseFormFilter}
 * @throws IOException if tokenization or writing fails
 */
public static void test(Reader reader, Writer writer, Voikko voikko, boolean successOnly) throws IOException {
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(reader);
    t = new BaseFormFilter(t, voikko, successOnly);
    BaseFormAttribute baseFormAtt = t.addAttribute(BaseFormAttribute.class);
    FlagsAttribute flagsAtt = t.addAttribute(FlagsAttribute.class);
    OriginalWordAttribute originalWordAtt = t.addAttribute(OriginalWordAttribute.class);
    String orig = "";
    TreeSet<String> tset = new TreeSet<String>();
    // Snapshot of the flags of the word currently being accumulated.
    FlagsAttribute flagsA = new FlagsAttributeImpl();
    try {
        t.reset();
        while (t.incrementToken()) {
            // The original word changed: flush the finished entry first.
            if (!orig.equals("") && !orig.equals(originalWordAtt.getOriginalWord())) {
                writeEntry(writer, orig, flagsA, tset);
                tset.clear();
            }
            orig = originalWordAtt.getOriginalWord();
            tset.addAll(baseFormAtt.getBaseForms());
            flagsA.setFlags(flagsAtt.getFlags());
        }
        // Flush the last accumulated entry (skip if there were no tokens at all).
        if (!orig.equals("")) {
            writeEntry(writer, orig, flagsA, tset);
        }
        t.end();
    } finally {
        t.close();
    }
}

/** Writes one report line for {@code orig}, appending its base forms when the FOUND flag is set. */
private static void writeEntry(Writer writer, String orig, FlagsAttribute flags, TreeSet<String> tset) throws IOException {
    writer.write("Sana: " + orig);
    if (Constants.hasFlag(flags, Constants.FOUND)) {
        writer.write(" M " + toString(tset));
    }
    writer.write("\n");
    writer.flush();
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project sukija by ahomansikka.
The class AppTest, method test.
/**
 * Tokenizes {@code input} through a {@link VoikkoFilter} and prints every
 * term plus the BASEFORM of each of its Voikko analyses to System.out.
 *
 * @param input          the text to analyze
 * @param expectedOutput NOTE(review): currently unused — the method always
 *                       returns true instead of comparing against it
 * @return always {@code true}
 * @throws IOException if tokenization fails
 */
private boolean test(String input, String expectedOutput) throws IOException {
    Reader r = new StringReader(input);
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(r);
    t = new VoikkoFilter(t, voikko);
    VoikkoAttribute sukijaAtt = t.addAttribute(VoikkoAttribute.class);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    try {
        // fix: reset after attributes are registered, and end()/close() the
        // stream — the original leaked it (never ended, never closed).
        t.reset();
        while (t.incrementToken()) {
            System.out.println("AppTest " + termAtt.toString());
            for (int i = 0; i < sukijaAtt.getAnalysis().size(); i++) {
                System.out.println(sukijaAtt.getAnalysis(i).get("BASEFORM"));
            }
            System.out.println("");
        }
        t.end();
    } finally {
        t.close();
    }
    return true;
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project jackrabbit-oak by apache.
The class LuceneIndex, method tokenize.
/**
 * Tokenizes {@code text} with the given analyzer, then merges back tokens
 * that the analyzer split on relevant fulltext query wildcards ('*' or '?'),
 * so that e.g. {@code "foo*bar"} survives as one token instead of two.
 *
 * @param text     the raw fulltext query fragment to tokenize
 * @param analyzer the analyzer used to produce the token stream
 * @return the tokens with adjacent wildcard characters re-attached, or
 *         {@code null} if tokenization failed with an I/O error
 */
static List<String> tokenize(String text, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(FieldNames.FULLTEXT, new StringReader(text));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        // poz tracks how far into the raw text we have consumed via offsets.
        int poz = 0;
        boolean hasFulltextToken = false;
        StringBuilder token = new StringBuilder();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            int start = offsetAtt.startOffset();
            int end = offsetAtt.endOffset();
            if (start > poz) {
                // The analyzer skipped characters between tokens: keep any
                // of them that are fulltext wildcard characters.
                for (int i = poz; i < start; i++) {
                    for (char c : fulltextTokens) {
                        if (c == text.charAt(i)) {
                            token.append(c);
                            hasFulltextToken = true;
                        }
                    }
                }
            }
            poz = end;
            if (hasFulltextToken) {
                // Glue this term onto the pending token ending in a wildcard.
                token.append(term);
                hasFulltextToken = false;
            } else {
                if (token.length() > 0) {
                    tokens.add(token.toString());
                }
                token = new StringBuilder();
                token.append(term);
            }
        }
        // Consume any wildcards trailing after the last token's end offset.
        if (poz < text.length()) {
            for (int i = poz; i < text.length(); i++) {
                for (char c : fulltextTokens) {
                    if (c == text.charAt(i)) {
                        token.append(c);
                    }
                }
            }
        }
        if (token.length() > 0) {
            tokens.add(token.toString());
        }
        stream.end();
    } catch (IOException e) {
        // fix: pass the exception itself (SLF4J logs the Throwable's stack
        // trace); the original passed e.getMessage() to a message pattern
        // with no placeholder, so it was silently dropped.
        LOG.error("Building fulltext query failed", e);
        return null;
    } finally {
        try {
            if (stream != null) {
                stream.close();
            }
        } catch (IOException e) {
            // ignored: nothing useful to do if closing the stream fails
        }
    }
    return tokens;
}
Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.
The class TestASCIIFoldingFilter, method testUnmodifiedLetters.
// Verifies that letters the filter leaves unmodified are emitted exactly once
// even when preserve-original mode is enabled (no duplicated tokens).
public void testUnmodifiedLetters() throws Exception {
    TokenStream stream = whitespaceMockTokenizer("§ ¦ ¤ END");
    ASCIIFoldingFilter filter = new ASCIIFoldingFilter(stream, true);
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    filter.reset();
    // Each unmodified input must come through as itself, exactly once.
    for (String term : new String[] { "§", "¦", "¤", "END" }) {
        assertNextTerms(term, term, filter, termAtt);
    }
    assertFalse(filter.incrementToken());
}
Aggregations