Usage example of org.omegat.util.Token in the omegat project (omegat-org): the tokenizeVerbatim method of the BaseTokenizer class.
/**
 * {@inheritDoc}
 */
@Override
public Token[] tokenizeVerbatim(final String strOrig) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    if (!shouldDelegateTokenizeExactly) {
        // Delegate to the Lucene-backed tokenizer with all filters disabled.
        return tokenize(strOrig, false, false, false, false);
    }
    // Walk the input with a word iterator, emitting one Token per segment.
    List<Token> tokens = new ArrayList<Token>(DEFAULT_TOKENS_COUNT);
    WordIterator wordIter = new WordIterator();
    wordIter.setText(strOrig);
    int begin = wordIter.first();
    int limit = wordIter.next();
    while (limit != BreakIterator.DONE) {
        tokens.add(new Token(strOrig.substring(begin, limit), begin));
        begin = limit;
        limit = wordIter.next();
    }
    return tokens.toArray(new Token[tokens.size()]);
}
Usage example of org.omegat.util.Token in the omegat project (omegat-org): the tokenize method of the BaseTokenizer class.
/**
 * Tokenizes the given string with the Lucene analyzer chain and converts the
 * resulting Lucene tokens into OmegaT {@link Token}s carrying their original
 * offsets.
 *
 * @param strOrig
 *            string to tokenize; an empty/null input yields an empty array
 * @param stemsAllowed
 *            whether stemming may be applied by the underlying token stream
 * @param stopWordsAllowed
 *            whether stop words are kept in the underlying token stream
 * @param filterDigits
 *            whether digit-only tokens are rejected by {@code acceptToken}
 * @param filterWhitespace
 *            whether whitespace tokens are rejected by {@code acceptToken}
 * @return array of accepted tokens; never {@code null}
 */
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed, final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> tokens = new ArrayList<Token>(64);
    try (TokenStream stream = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed)) {
        // Register the attributes we need before consuming the stream.
        stream.addAttribute(CharTermAttribute.class);
        stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = stream.getAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String text = termAttr.toString();
            if (!acceptToken(text, filterDigits, filterWhitespace)) {
                continue;
            }
            int begin = offsetAttr.startOffset();
            tokens.add(new Token(text, begin, offsetAttr.endOffset() - begin));
        }
        stream.end();
    } catch (IOException ex) {
        // Tokenization is best-effort: log and return what was gathered.
        Log.log(ex);
    }
    return tokens.toArray(new Token[tokens.size()]);
}
Usage example of org.omegat.util.Token in the omegat project (omegat-org): the searchAllInexact method of the DefaultTokenizer class.
/**
 * Check if all elements of {@code needles} are present in {@code haystack}, and if so, return all matching elements
 * of {@code haystack}.
 *
 * @param haystack
 *            a list of tokens to be searched
 * @param needles
 *            a list of tokens to search in {@code haystack}
 * @return A list of one array containing the unique hits of the matched tokens, or an empty list if any needle is
 *         missing or {@code needles} is empty.
 */
private static List<Token[]> searchAllInexact(Token[] haystack, Token[] needles) {
    // Guard: with no needles there is nothing to match. The previous
    // implementation lazily initialized `result` inside the loop and would
    // throw a NullPointerException on result.size() for an empty needles array.
    if (needles.length == 0) {
        return Collections.emptyList();
    }
    List<Token> result = new ArrayList<>();
    for (Token n : needles) {
        boolean found = false;
        // Collect every occurrence of this needle, de-duplicated.
        for (int i = 0; (i = search(haystack, n, i)) != -1; i++) {
            found = true;
            if (!contains(result, haystack[i])) {
                result.add(haystack[i]);
            }
        }
        if (!found) {
            // One missing needle fails the whole query.
            return Collections.emptyList();
        }
    }
    if (result.size() < needles.length) {
        return Collections.emptyList();
    }
    // We expect to filter results later, so we can't use Collections.singletonList here
    List<Token[]> ret = new ArrayList<>();
    ret.add(result.toArray(new Token[result.size()]));
    return ret;
}
Usage example of org.omegat.util.Token in the omegat project (omegat-org): the tokenizeTextNoCache method of the DefaultTokenizer class.
/**
 * Breaks a string into tokens.
 * <p>
 * Examples:
 * <ul>
 * <li>This is a semi-good way. -> "this", "is", "a", "semi-good", "way"
 * <li>Fine, thanks, and you? -> "fine", "thanks", "and", "you"
 * <li>C&all this action -> "call", "this", "action" ('&' is eaten)
 * </ul>
 * <p>
 * OmegaT tags and other non-word tokens are skipped if the parameter "all"
 * is false.
 *
 * @param strOrig
 *            string to tokenize
 * @param all
 *            If true, numbers, tags, and other non-word tokens are included
 *            in the list
 * @return array of tokens; never {@code null}
 */
private static Token[] tokenizeTextNoCache(final String strOrig, final boolean all) {
    if (StringUtil.isEmpty(strOrig)) {
        // fixes bug nr. 1382810 (StringIndexOutOfBoundsException)
        return EMPTY_TOKENS_LIST;
    }
    // create a new token list
    List<Token> tokens = new ArrayList<Token>(64);
    // get a word breaker
    BreakIterator breaker = getWordBreaker();
    breaker.setText(strOrig);
    int start = breaker.first();
    for (int end = breaker.next(); end != BreakIterator.DONE; start = end, end = breaker.next()) {
        String tokenStr = strOrig.substring(start, end);
        if (all) {
            // Accepting all tokens
            tokens.add(new Token(tokenStr, start));
            continue;
        }
        // Accepting only words that aren't OmegaT tags.
        // A "word" is any segment containing at least one letter; iterate by
        // code point so supplementary characters are handled correctly.
        boolean word = false;
        for (int cp, i = 0; i < tokenStr.length(); i += Character.charCount(cp)) {
            cp = tokenStr.codePointAt(i);
            if (Character.isLetter(cp)) {
                word = true;
                break;
            }
        }
        if (word && !PatternConsts.OMEGAT_TAG.matcher(tokenStr).matches()) {
            tokens.add(new Token(tokenStr, start));
        }
    }
    return tokens.toArray(new Token[tokens.size()]);
}
Usage example of org.omegat.util.Token in the omegat project (omegat-org): the testContains method of the DefaultTokenizerTest class.
@Test
public void testContains() {
    String sentence = "The quick brown fox jumped over the lazy dog.";
    Token[] haystack = new DefaultTokenizer().tokenizeVerbatim(sentence);
    // Every token derived from the sentence itself must be found in it.
    for (Token needle : toTokArr(sentence)) {
        assertTrue(DefaultTokenizer.isContains(haystack, needle));
    }
    // A token that never occurs in the sentence must not be found.
    assertFalse(DefaultTokenizer.isContains(haystack, new Token("elephant", 0)));
}
End of aggregated usage examples for org.omegat.util.Token.