Search in sources :

Example 11 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class BaseTokenizer method tokenizeVerbatim.

/**
 * {@inheritDoc}
 * <p>
 * An empty input yields {@code EMPTY_TOKENS_LIST}. When
 * {@code shouldDelegateTokenizeExactly} is not set, the work is handed to
 * {@code tokenize} with every option switched off. Otherwise the text is cut
 * at the boundaries reported by a {@code WordIterator}, and every segment
 * between two consecutive boundaries becomes a token carrying the segment
 * text and its start offset in {@code strOrig}.
 *
 * @param strOrig
 *            the text to split
 * @return the tokens in order of appearance
 */
@Override
public Token[] tokenizeVerbatim(final String strOrig) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    if (shouldDelegateTokenizeExactly) {
        List<Token> tokens = new ArrayList<Token>(DEFAULT_TOKENS_COUNT);
        WordIterator boundaries = new WordIterator();
        boundaries.setText(strOrig);
        // Walk consecutive boundary pairs [from, to) until the iterator is exhausted.
        int from = boundaries.first();
        int to = boundaries.next();
        while (to != BreakIterator.DONE) {
            tokens.add(new Token(strOrig.substring(from, to), from));
            from = to;
            to = boundaries.next();
        }
        return tokens.toArray(new Token[tokens.size()]);
    }
    return tokenize(strOrig, false, false, false, false);
}
Also used : ArrayList(java.util.ArrayList) Token(org.omegat.util.Token)

Example 12 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class BaseTokenizer method tokenize.

/**
 * Feeds {@code strOrig} through the stream returned by {@code getTokenStream}
 * and collects every term that {@code acceptToken} lets through.
 * <p>
 * Each resulting token stores the term text, its start offset and its length
 * (end offset minus start offset) as reported by the stream's offset
 * attribute. An {@code IOException} raised while reading the stream is
 * logged, and whatever tokens were gathered up to that point are returned.
 *
 * @param strOrig
 *            the text to tokenize; empty input yields {@code EMPTY_TOKENS_LIST}
 * @param stemsAllowed
 *            passed through to {@code getTokenStream}
 * @param stopWordsAllowed
 *            passed through to {@code getTokenStream}
 * @param filterDigits
 *            passed through to {@code acceptToken}
 * @param filterWhitespace
 *            passed through to {@code acceptToken}
 * @return the accepted tokens in stream order
 */
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> accepted = new ArrayList<Token>(64);
    try (TokenStream stream = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed)) {
        // Register the attributes first, then fetch the instances the stream will update.
        stream.addAttribute(CharTermAttribute.class);
        stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsets = stream.getAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String text = term.toString();
            if (!acceptToken(text, filterDigits, filterWhitespace)) {
                continue;
            }
            int begin = offsets.startOffset();
            int length = offsets.endOffset() - begin;
            accepted.add(new Token(text, begin, length));
        }
        stream.end();
    } catch (IOException ex) {
        Log.log(ex);
    }
    return accepted.toArray(new Token[accepted.size()]);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Token(org.omegat.util.Token) IOException(java.io.IOException)

Example 13 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class DefaultTokenizer method searchAllInexact.

/**
 * Check if all elements of {@code needles} are present in {@code haystack}, and if so, return all matching elements
 * of {@code haystack}.
 * <p>
 * Every occurrence of every needle is looked up with {@code search}; a hit is recorded only once (as decided by
 * {@code contains}). The search is abandoned as soon as one needle has no occurrence at all.
 *
 * @param haystack
 *            a list of tokens to be searched
 * @param needles
 *            a list of tokens to search in {@code haystack}
 * @return A list of one array containing the unique hits of the matched tokens. An empty list is returned when
 *         {@code needles} is {@code null} or empty, when any needle is missing from {@code haystack}, or when
 *         fewer unique hits than needles were collected.
 */
private static List<Token[]> searchAllInexact(Token[] haystack, Token[] needles) {
    // With nothing to look for there can be no hits. Previously an empty needle array left the
    // result list unallocated and the size check below failed with a NullPointerException.
    if (needles == null || needles.length == 0) {
        return Collections.emptyList();
    }
    List<Token> result = new ArrayList<>();
    for (Token n : needles) {
        boolean found = false;
        // search() returns the index of the next occurrence at or after i, or -1 when there is none;
        // the i++ moves past the hit so the following call looks for the next occurrence.
        for (int i = 0; (i = search(haystack, n, i)) != -1; i++) {
            found = true;
            if (!contains(result, haystack[i])) {
                result.add(haystack[i]);
            }
        }
        if (!found) {
            return Collections.emptyList();
        }
    }
    if (result.size() < needles.length) {
        return Collections.emptyList();
    }
    // We expect to filter results later, so we can't use Collections.singletonList here
    List<Token[]> ret = new ArrayList<>();
    ret.add(result.toArray(new Token[result.size()]));
    return ret;
}
Also used : ArrayList(java.util.ArrayList) Token(org.omegat.util.Token)

Example 14 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class DefaultTokenizer method tokenizeTextNoCache.

/**
 * Breaks a string into tokens at the boundaries reported by the word breaker
 * obtained from {@code getWordBreaker()}.
 * <p>
 * Every segment between two consecutive boundaries is a token candidate. The
 * token keeps the segment text and its start offset in {@code strOrig}.
 * <ul>
 * <li>If {@code all} is true, every candidate is kept.
 * <li>If {@code all} is false, a candidate is kept only when it contains at
 * least one letter code point and does not match
 * {@code PatternConsts.OMEGAT_TAG} in its entirety.
 * </ul>
 *
 * @param strOrig
 *            string to tokenize
 * @param all
 *            If true, numbers, tags, and other non-word tokens are included
 *            in the list
 * @return array of tokens, in order of appearance; {@code EMPTY_TOKENS_LIST}
 *         for empty input
 */
private static Token[] tokenizeTextNoCache(final String strOrig, final boolean all) {
    // Bail out early on empty input: fixes bug nr. 1382810 (StringIndexOutOfBoundsException)
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> collected = new ArrayList<Token>(64);
    BreakIterator boundaries = getWordBreaker();
    boundaries.setText(strOrig);
    int from = boundaries.first();
    int to = boundaries.next();
    while (to != BreakIterator.DONE) {
        String segment = strOrig.substring(from, to);
        boolean keep = all;
        if (!keep) {
            // Scan by code point (not by char) so supplementary characters are classified correctly.
            boolean hasLetter = false;
            int pos = 0;
            while (!hasLetter && pos < segment.length()) {
                int codePoint = segment.codePointAt(pos);
                hasLetter = Character.isLetter(codePoint);
                pos += Character.charCount(codePoint);
            }
            // Only words that aren't OmegaT tags are accepted.
            keep = hasLetter && !PatternConsts.OMEGAT_TAG.matcher(segment).matches();
        }
        if (keep) {
            collected.add(new Token(segment, from));
        }
        from = to;
        to = boundaries.next();
    }
    return collected.toArray(new Token[collected.size()]);
}
Also used : ArrayList(java.util.ArrayList) Token(org.omegat.util.Token) BreakIterator(java.text.BreakIterator)

Example 15 with Token

use of org.omegat.util.Token in project omegat by omegat-org.

the class DefaultTokenizerTest method testContains.

/**
 * Each token obtained from {@code toTokArr} for the sample sentence must be
 * reported as contained in the verbatim tokenization of that sentence, while
 * a word that does not occur in it must not.
 */
@Test
public void testContains() {
    final String sentence = "The quick brown fox jumped over the lazy dog.";
    final DefaultTokenizer tokenizer = new DefaultTokenizer();
    final Token[] verbatimTokens = tokenizer.tokenizeVerbatim(sentence);
    for (Token expected : toTokArr(sentence)) {
        assertTrue(DefaultTokenizer.isContains(verbatimTokens, expected));
    }
    final Token absent = new Token("elephant", 0);
    assertFalse(DefaultTokenizer.isContains(verbatimTokens, absent));
}
Also used : Token(org.omegat.util.Token) Test(org.junit.Test)

Aggregations

Token (org.omegat.util.Token)19 ArrayList (java.util.ArrayList)8 NearString (org.omegat.core.matching.NearString)3 List (java.util.List)2 ITokenizer (org.omegat.tokenizer.ITokenizer)2 Point (java.awt.Point)1 IOException (java.io.IOException)1 BreakIterator (java.text.BreakIterator)1 Arrays (java.util.Arrays)1 Collections (java.util.Collections)1 Comparator (java.util.Comparator)1 HashSet (java.util.HashSet)1 Matcher (java.util.regex.Matcher)1 JMenuItem (javax.swing.JMenuItem)1 HighlightPainter (javax.swing.text.Highlighter.HighlightPainter)1 StyledDocument (javax.swing.text.StyledDocument)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)1 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)1 Test (org.junit.Test)1